Index: test/CodeGen/aarch64-neon-2velem.c
===================================================================
--- test/CodeGen/aarch64-neon-2velem.c
+++ test/CodeGen/aarch64-neon-2velem.c
@@ -1,2452 +1,5011 @@
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

 // Test new aarch64 intrinsics and types

 #include <arm_neon.h>

+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK: ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmla_lane_s16
   return vmla_lane_s16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK: ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s16
   return vmlaq_lane_s16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK: ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_s32
   return vmla_lane_s32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK: ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s32
   return vmlaq_lane_s32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK: ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s16
   return vmla_laneq_s16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
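All of the vmla/vmls lane tests in this file assert the same IR recipe: broadcast one lane of v with a shufflevector, multiply elementwise, then accumulate. A minimal C sketch of those reference semantics for vmla_lane_s16 follows (an illustrative aside, not part of the patch; vmla_lane_s16_ref and LANE are invented names, and it reuses the arm_neon.h included above):

    /* Reference semantics for vmla_lane_s16(a, b, v, LANE): every element
       of b is multiplied by the single element v[LANE], and the products
       are accumulated into a.  This mirrors the shufflevector + mul + add
       sequence the CHECK lines match. */
    static int16x4_t vmla_lane_s16_ref(int16x4_t a, int16x4_t b, int16x4_t v) {
      enum { LANE = 3 };          /* same lane the test passes to vmla_lane_s16 */
      int16_t out[4];
      for (int i = 0; i < 4; ++i) /* broadcast v[LANE], multiply, accumulate */
        out[i] = (int16_t)(a[i] + b[i] * v[LANE]);
      return vld1_s16(out);
    }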
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK: ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s16
   return vmlaq_laneq_s16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK: ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s32
   return vmla_laneq_s32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK: ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s32
   return vmlaq_laneq_s32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK: ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmls_lane_s16
   return vmls_lane_s16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK: ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s16
   return vmlsq_lane_s16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK: ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_s32
   return vmls_lane_s32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK: ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s32
   return vmlsq_lane_s32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK: ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s16
   return vmls_laneq_s16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }

+// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK: ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s16
   return vmlsq_laneq_s16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK: ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s32
   return vmls_laneq_s32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK: ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s32
   return vmlsq_laneq_s32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_s16
   return vmul_lane_s16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s16
   return vmulq_lane_s16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_s32
   return vmul_lane_s32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s32
   return vmulq_lane_s32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_u16
   return vmul_lane_u16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u16
   return vmulq_lane_u16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_u32
   return vmul_lane_u32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u32
   return vmulq_lane_u32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s16
   return vmul_laneq_s16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }

+// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s16
   return vmulq_laneq_s16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s32
   return vmul_laneq_s32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s32
   return vmulq_laneq_s32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u16
   return vmul_laneq_u16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }

+// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK: ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u16
   return vmulq_laneq_u16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }

+// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u32
   return vmul_laneq_u32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK: ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u32
   return vmulq_laneq_u32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
+// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA]]1)
+// CHECK: ret <2 x float> [[FMLA]]2
 float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfma_lane_f32
   return vfma_lane_f32(a, b, v, 1);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA]]1)
+// CHECK: ret <4 x float> [[FMLA]]2
 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmaq_lane_f32
   return vfmaq_lane_f32(a, b, v, 1);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
+// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK: ret <2 x float> [[TMP6]]
 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfma_laneq_f32
   return vfma_laneq_f32(a, b, v, 3);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK: ret <4 x float> [[TMP6]]
 float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f32
   return vfmaq_laneq_f32(a, b, v, 3);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
+// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA]]1)
+// CHECK: ret <2 x float> [[FMLA]]2
 float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfms_lane_f32
   return vfms_lane_f32(a, b, v, 1);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA]]1)
+// CHECK: ret <4 x float> [[FMLA]]2
 float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmsq_lane_f32
   return vfmsq_lane_f32(a, b, v, 1);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }

+// CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
+// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK: ret <2 x float> [[TMP6]]
 float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfms_laneq_f32
   return vfms_laneq_f32(a, b, v, 3);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK: ret <4 x float> [[TMP6]]
 float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f32
   return vfmsq_laneq_f32(a, b, v, 3);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA]]1)
+// CHECK: ret <2 x double> [[FMLA]]2
 float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
-  // CHECK-LABEL: test_vfmaq_lane_f64
   return vfmaq_lane_f64(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }

+// CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
+// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK: ret <2 x double> [[TMP6]]
 float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f64
   return vfmaq_laneq_f64(a, b, v, 1);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }

+// CHECK-LABEL: define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA]]1)
+// CHECK: ret <2 x double> [[FMLA]]2
 float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
-  // CHECK-LABEL: test_vfmsq_lane_f64
   return vfmsq_lane_f64(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }

+// CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
+// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK: ret <2 x double> [[TMP6]]
 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f64
   return vfmsq_laneq_f64(a, b, v, 1);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
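The vfms/vfmls tests above all rely on one identity: there is no separate fused-multiply-subtract intrinsic in the IR, so Clang negates the lane operand with an fsub from -0.0 and feeds the result to the same @llvm.fma call. A scalar C sketch of that identity follows (an illustrative aside, not part of the patch; fms_via_fma is an invented name):

    #include <math.h>

    /* a - b*v computed as fma(b, -v, a): negating one multiplicand turns a
       fused multiply-add into a fused multiply-subtract.  "-0.0 - v" mirrors
       the IR's "fsub <double -0.000000e+00, ...>, %v", which negates
       correctly even for signed zeros. */
    static double fms_via_fma(double a, double b, double v) {
      return fma(b, -0.0 - v, a);
    }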
+// CHECK-LABEL: define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
+// CHECK: ret float [[TMP2]]
 float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmas_laneq_f32
   return vfmas_laneq_f32(a, b, v, 3);
-  // CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
+// CHECK: ret double [[TMP2]]
 float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
-  // CHECK-LABEL: test_vfmsd_lane_f64
   return vfmsd_lane_f64(a, b, v, 0);
-  // CHECK: {{fmls d[0-9]+, d[0-9]+, v[0-9]+\.d\[0\]|fmsub d[0-9]+, d[0-9]+, d[0-9]+}}
 }

+// CHECK-LABEL: define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
+// CHECK: ret float [[TMP2]]
 float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmss_laneq_f32
   return vfmss_laneq_f32(a, b, v, 3);
-  // CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }

+// CHECK-LABEL: define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) #0 {
+// CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
+// CHECK: ret double [[TMP2]]
 float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmsd_laneq_f64
   return vfmsd_laneq_f64(a, b, v, 1);
-  // CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }

+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK: ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_lane_s16
   return vmlal_lane_s16(a, b, v, 3);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
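The vmlal/vmlsl lane tests widen before multiplying: both 16-bit (or 32-bit) inputs are extended, multiplied via the @llvm.aarch64.neon.smull/umull intrinsics, and only then accumulated or subtracted in the wider type. A plain-C sketch of the reference semantics of vmlal_lane_s16 (an illustrative aside, not part of the patch; vmlal_lane_s16_ref and LANE are invented names, and it reuses the arm_neon.h included above):

    /* Reference semantics for vmlal_lane_s16(a, b, v, LANE):
       widen b[i] and v[LANE] to 32 bits, multiply, accumulate into a. */
    static int32x4_t vmlal_lane_s16_ref(int32x4_t a, int16x4_t b, int16x4_t v) {
      enum { LANE = 3 };          /* the lane test_vmlal_lane_s16 uses */
      int32_t out[4];
      for (int i = 0; i < 4; ++i) /* widen to 32 bits, multiply, accumulate */
        out[i] = a[i] + (int32_t)b[i] * (int32_t)v[LANE];
      return vld1q_s32(out);
    }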
CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_lane_s32 return vmlal_lane_s32(a, b, v, 1); - // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_laneq_s16 return vmlal_laneq_s16(a, b, v, 7); - // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_laneq_s32 return vmlal_laneq_s32(a, b, v, 3); - // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: 
test_vmlal_high_lane_s16 return vmlal_high_lane_s16(a, b, v, 3); - // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_high_lane_s32 return vmlal_high_lane_s32(a, b, v, 1); - // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_s16 return vmlal_high_laneq_s16(a, b, v, 7); - // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_s32 return vmlal_high_laneq_s32(a, b, v, 3); - // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x 
i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_lane_s16 return vmlsl_lane_s16(a, b, v, 3); - // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_lane_s32 return vmlsl_lane_s32(a, b, v, 1); - // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_laneq_s16 return vmlsl_laneq_s16(a, b, v, 7); - // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_laneq_s32 return vmlsl_laneq_s32(a, b, v, 3); - // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: 
[[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_s16 return vmlsl_high_lane_s16(a, b, v, 3); - // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_s32 return vmlsl_high_lane_s32(a, b, v, 1); - // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_s16 return vmlsl_high_laneq_s16(a, b, v, 7); - // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_s32 return vmlsl_high_laneq_s32(a, b, v, 3); - // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x 
i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlal_lane_u16 return vmlal_lane_u16(a, b, v, 3); - // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_lane_u32 return vmlal_lane_u32(a, b, v, 1); - // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_laneq_u16 return vmlal_laneq_u16(a, b, v, 7); - // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_laneq_u32 return vmlal_laneq_u32(a, b, v, 3); - // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: 
[[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlal_high_lane_u16 return vmlal_high_lane_u16(a, b, v, 3); - // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_high_lane_u32 return vmlal_high_lane_u32(a, b, v, 1); - // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_u16 return vmlal_high_laneq_u16(a, b, v, 7); - // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = 
add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_u32 return vmlal_high_laneq_u32(a, b, v, 3); - // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_lane_u16 return vmlsl_lane_u16(a, b, v, 3); - // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_lane_u32 return vmlsl_lane_u32(a, b, v, 1); - // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_laneq_u16 return vmlsl_laneq_u16(a, b, v, 7); - // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 
+// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_laneq_u32 return vmlsl_laneq_u32(a, b, v, 3); - // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_u16 return vmlsl_high_lane_u16(a, b, v, 3); - // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_u32 return vmlsl_high_lane_u32(a, b, v, 1); - // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_u16 return vmlsl_high_laneq_u16(a, b, v, 7); - // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] 
to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_u32 return vmlsl_high_laneq_u32(a, b, v, 3); - // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vmull_lane_s16 return vmull_lane_s16(a, v, 3); - // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vmull_lane_s32 return vmull_lane_s32(a, v, 1); - // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { - // CHECK-LABEL: test_vmull_lane_u16 return vmull_lane_u16(a, v, 3); - // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64>
@llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) { - // CHECK-LABEL: test_vmull_lane_u32 return vmull_lane_u32(a, v, 1); - // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vmull_high_lane_s16 return vmull_high_lane_s16(a, v, 3); - // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vmull_high_lane_s32 return vmull_high_lane_s32(a, v, 1); - // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) { - // CHECK-LABEL: test_vmull_high_lane_u16 return vmull_high_lane_u16(a, v, 3); - // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x
i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) { - // CHECK-LABEL: test_vmull_high_lane_u32 return vmull_high_lane_u32(a, v, 1); - // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vmull_laneq_s16 return vmull_laneq_s16(a, v, 7); - // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vmull_laneq_s32 return vmull_laneq_s32(a, v, 3); - // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) { - // CHECK-LABEL: test_vmull_laneq_u16 return vmull_laneq_u16(a, v, 7); - // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) { - // CHECK-LABEL: test_vmull_laneq_u32 return vmull_laneq_u32(a, v, 3); - // CHECK: umull
{{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vmull_high_laneq_s16 return vmull_high_laneq_s16(a, v, 7); - // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vmull_high_laneq_s32 return vmull_high_laneq_s32(a, v, 3); - // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) { - // CHECK-LABEL: test_vmull_high_laneq_u16 return vmull_high_laneq_u16(a, v, 7); - // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v)
{ - // CHECK-LABEL: test_vmull_high_laneq_u32 return vmull_high_laneq_u32(a, v, 3); - // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlal_lane_s16 return vqdmlal_lane_s16(a, b, v, 3); - // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlal_lane_s32 return vqdmlal_lane_s32(a, b, v, 1); - // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlal_high_lane_s16 return vqdmlal_high_lane_s16(a, b, v, 3);
- // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlal_high_lane_s32 return vqdmlal_high_lane_s32(a, b, v, 1); - // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlsl_lane_s16 return vqdmlsl_lane_s16(a, b, v, 3); - // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlsl_lane_s32 return vqdmlsl_lane_s32(a, b, v, 1); - // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +//
CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlsl_high_lane_s16 return vqdmlsl_high_lane_s16(a, b, v, 3); - // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlsl_high_lane_s32 return vqdmlsl_high_lane_s32(a, b, v, 1); - // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmull_lane_s16 return vqdmull_lane_s16(a, v, 3); - // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK:
[[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmull_lane_s32 return vqdmull_lane_s32(a, v, 1); - // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmull_laneq_s16 return vqdmull_laneq_s16(a, v, 3); - // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmull_laneq_s32 return vqdmull_laneq_s32(a, v, 3); - // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]]
= bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmull_high_lane_s16 return vqdmull_high_lane_s16(a, v, 3); - // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmull_high_lane_s32 return vqdmull_high_lane_s32(a, v, 1); - // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmull_high_laneq_s16 return vqdmull_high_laneq_s16(a, v, 7); - // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmull_high_laneq_s32
return vqdmull_high_laneq_s32(a, v, 3); - // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmulh_lane_s16 return vqdmulh_lane_s16(a, v, 3); - // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmulhq_lane_s16 return vqdmulhq_lane_s16(a, v, 3); - // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmulh_lane_s32 return vqdmulh_lane_s32(a, v, 1); - // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x
i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmulhq_lane_s32 return vqdmulhq_lane_s32(a, v, 1); - // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vqrdmulh_lane_s16 return vqrdmulh_lane_s16(a, v, 3); - // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vqrdmulhq_lane_s16 return vqrdmulhq_lane_s16(a, v, 3); - // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vqrdmulh_lane_s32 return vqrdmulh_lane_s32(a, v, 1); - // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x
i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vqrdmulhq_lane_s32 return vqrdmulhq_lane_s32(a, v, 1); - // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] +// CHECK: ret <2 x float> [[MUL]] float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) { - // CHECK-LABEL: test_vmul_lane_f32 return vmul_lane_f32(a, v, 1); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %v to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0 +// CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]] +// CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double> +// CHECK: ret <1 x double> [[TMP5]] float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) { - // CHECK-LABEL: test_vmul_lane_f64 return vmul_lane_f64(a, v, 0); - // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+\.d\[0\]|d[0-9]+}} } +// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] +// CHECK: ret <4 x float> [[MUL]] float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) { - // CHECK-LABEL: test_vmulq_lane_f32 return vmulq_lane_f32(a, v, 1); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]] +// CHECK: ret <2 x double> [[MUL]] float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) { - // CHECK-LABEL: test_vmulq_lane_f64 return vmulq_lane_f64(a, v, 0); - // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] +// CHECK: ret <2 x float> [[MUL]] float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) { - // CHECK-LABEL: test_vmul_laneq_f32
return vmul_laneq_f32(a, v, 3); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +// CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]] +// CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double> +// CHECK: ret <1 x double> [[TMP5]] float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) { - // CHECK-LABEL: test_vmul_laneq_f64 return vmul_laneq_f64(a, v, 1); - // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] } +// CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] +// CHECK: ret <4 x float> [[MUL]] float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) { - // CHECK-LABEL: test_vmulq_laneq_f32 return vmulq_laneq_f32(a, v, 3); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]] +// CHECK: ret <2 x double> [[MUL]] float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) { - // CHECK-LABEL: test_vmulq_laneq_f64 return vmulq_laneq_f64(a, v, 1); - // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] } +// CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2 +// CHECK: ret <2 x float> [[VMULX2_I]] float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) { - // CHECK-LABEL: test_vmulx_lane_f32 return vmulx_lane_f32(a, v, 1); - // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2 +// CHECK: ret <4 x float> [[VMULX2_I]] float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) { - // CHECK-LABEL: test_vmulxq_lane_f32 return vmulxq_lane_f32(a, v, 1); - // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <2 x double>
@test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2 +// CHECK: ret <2 x double> [[VMULX2_I]] float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) { - // CHECK-LABEL: test_vmulxq_lane_f64 return vmulxq_lane_f64(a, v, 0); - // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2 +// CHECK: ret <2 x float> [[VMULX2_I]] float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) { - // CHECK-LABEL: test_vmulx_laneq_f32 return vmulx_laneq_f32(a, v, 3); - // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2 +// CHECK: ret <4 x float> [[VMULX2_I]] float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) { - // CHECK-LABEL: test_vmulxq_laneq_f32 return vmulxq_laneq_f32(a, v, 3); - // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2 +// CHECK: ret <2 x double> [[VMULX2_I]] float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) { - // CHECK-LABEL: test_vmulxq_laneq_f64 return vmulxq_laneq_f64(a, v, 1); - // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] } +// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +//
CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmla_lane_s16_0 return vmla_lane_s16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlaq_lane_s16_0 return vmlaq_lane_s16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmla_lane_s32_0 return vmla_lane_s32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlaq_lane_s32_0 return vmlaq_lane_s32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmla_laneq_s16_0 return vmla_laneq_s16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlaq_laneq_s16_0 return vmlaq_laneq_s16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: 
test_vmla_laneq_s32_0 return vmla_laneq_s32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlaq_laneq_s32_0 return vmlaq_laneq_s32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmls_lane_s16_0 return vmls_lane_s16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsq_lane_s16_0 return vmlsq_lane_s16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmls_lane_s32_0 return vmls_lane_s32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsq_lane_s32_0 return vmlsq_lane_s32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmls_laneq_s16_0 return vmls_laneq_s16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { 
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsq_laneq_s16_0 return vmlsq_laneq_s16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmls_laneq_s32_0 return vmls_laneq_s32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsq_laneq_s32_0 return vmlsq_laneq_s32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: ret <4 x i16> [[MUL]] int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vmul_lane_s16_0 return vmul_lane_s16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: ret <8 x i16> [[MUL]] int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vmulq_lane_s16_0 return vmulq_lane_s16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: ret <2 x i32> [[MUL]] int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vmul_lane_s32_0 return vmul_lane_s32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: ret <4 x i32> [[MUL]] int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vmulq_lane_s32_0 return vmulq_lane_s32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// 
CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: ret <4 x i16> [[MUL]] uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) { - // CHECK-LABEL: test_vmul_lane_u16_0 return vmul_lane_u16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: ret <8 x i16> [[MUL]] uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) { - // CHECK-LABEL: test_vmulq_lane_u16_0 return vmulq_lane_u16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: ret <2 x i32> [[MUL]] uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) { - // CHECK-LABEL: test_vmul_lane_u32_0 return vmul_lane_u32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: ret <4 x i32> [[MUL]] uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) { - // CHECK-LABEL: test_vmulq_lane_u32_0 return vmulq_lane_u32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: ret <4 x i16> [[MUL]] int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vmul_laneq_s16_0 return vmul_laneq_s16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: ret <8 x i16> [[MUL]] int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vmulq_laneq_s16_0 return vmulq_laneq_s16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: ret <2 x i32> [[MUL]] int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vmul_laneq_s32_0 return vmul_laneq_s32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: ret <4 x i32> [[MUL]] int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, 
int32x4_t v) { - // CHECK-LABEL: test_vmulq_laneq_s32_0 return vmulq_laneq_s32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: ret <4 x i16> [[MUL]] uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) { - // CHECK-LABEL: test_vmul_laneq_u16_0 return vmul_laneq_u16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: ret <8 x i16> [[MUL]] uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) { - // CHECK-LABEL: test_vmulq_laneq_u16_0 return vmulq_laneq_u16(a, v, 0); - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: ret <2 x i32> [[MUL]] uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) { - // CHECK-LABEL: test_vmul_laneq_u32_0 return vmul_laneq_u32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: ret <4 x i32> [[MUL]] uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) { - // CHECK-LABEL: test_vmulq_laneq_u32_0 return vmulq_laneq_u32(a, v, 0); - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA]]1) +// CHECK: ret <2 x float> [[FMLA]]2 float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { - // CHECK-LABEL: test_vfma_lane_f32_0 return vfma_lane_f32(a, b, v, 0); - // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: 
[[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA]]1) +// CHECK: ret <4 x float> [[FMLA]]2 float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { - // CHECK-LABEL: test_vfmaq_lane_f32_0 return vfmaq_lane_f32(a, b, v, 0); - // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer +// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]]) +// CHECK: ret <2 x float> [[TMP6]] float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { - // CHECK-LABEL: test_vfma_laneq_f32_0 return vfma_laneq_f32(a, b, v, 0); - // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer +// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]]) +// CHECK: ret <4 x float> [[TMP6]] float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { - // CHECK-LABEL: test_vfmaq_laneq_f32_0 return vfmaq_laneq_f32(a, b, v, 0); - // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA]]1) +// CHECK: ret <2 x float> [[FMLA]]2 float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { - // CHECK-LABEL: test_vfms_lane_f32_0 return vfms_lane_f32(a, b, v, 0); - // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +//
CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA]]1) +// CHECK: ret <4 x float> [[FMLA]]2 float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { - // CHECK-LABEL: test_vfmsq_lane_f32_0 return vfmsq_lane_f32(a, b, v, 0); - // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer +// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]]) +// CHECK: ret <2 x float> [[TMP6]] float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { - // CHECK-LABEL: test_vfms_laneq_f32_0 return vfms_laneq_f32(a, b, v, 0); - // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer +// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]]) +// CHECK: ret <4 x float> [[TMP6]] float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { - // CHECK-LABEL: test_vfmsq_laneq_f32_0 return vfmsq_laneq_f32(a, b, v, 0); - // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP4:%.*]] = bitcast
<16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer +// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]]) +// CHECK: ret <2 x double> [[TMP6]] float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) { - // CHECK-LABEL: test_vfmaq_laneq_f64_0 return vfmaq_laneq_f64(a, b, v, 0); - // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 { +// CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer +// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]]) +// CHECK: ret <2 x double> [[TMP6]] float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) { - // CHECK-LABEL: test_vfmsq_laneq_f64_0 return vfmsq_laneq_f64(a, b, v, 0); - // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlal_lane_s16_0 return vmlal_lane_s16(a, b, v, 0); - // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_lane_s32_0 return vmlal_lane_s32(a, b, v, 0); - // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x
i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_laneq_s16_0 return vmlal_laneq_s16(a, b, v, 0); - // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_laneq_s32_0 return vmlal_laneq_s32(a, b, v, 0); - // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlal_high_lane_s16_0 return vmlal_high_lane_s16(a, b, v, 0); - // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t
test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_high_lane_s32_0 return vmlal_high_lane_s32(a, b, v, 0); - // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_s16_0 return vmlal_high_laneq_s16(a, b, v, 0); - // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_s32_0 return vmlal_high_laneq_s32(a, b, v, 0); - // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_lane_s16_0 return vmlsl_lane_s16(a, b, v, 0); - // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +//
CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_lane_s32_0 return vmlsl_lane_s32(a, b, v, 0); - // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_laneq_s16_0 return vmlsl_laneq_s16(a, b, v, 0); - // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_laneq_s32_0 return vmlsl_laneq_s32(a, b, v, 0); - // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_s16_0 return vmlsl_high_lane_s16(a, b, v, 0); - // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x
i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_s32_0 return vmlsl_high_lane_s32(a, b, v, 0); - // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_s16_0 return vmlsl_high_laneq_s16(a, b, v, 0); - // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_s32_0 return vmlsl_high_laneq_s32(a, b, v, 0); - // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v)
{ - // CHECK-LABEL: test_vmlal_lane_u16_0 return vmlal_lane_u16(a, b, v, 0); - // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_lane_u32_0 return vmlal_lane_u32(a, b, v, 0); - // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_laneq_u16_0 return vmlal_laneq_u16(a, b, v, 0); - // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_laneq_u32_0 return vmlal_laneq_u32(a, b, v, 0); - // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK:
[[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlal_high_lane_u16_0 return vmlal_high_lane_u16(a, b, v, 0); - // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlal_high_lane_u32_0 return vmlal_high_lane_u32(a, b, v, 0); - // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_u16_0 return vmlal_high_laneq_u16(a, b, v, 0); - // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlal_high_laneq_u32_0 return vmlal_high_laneq_u32(a, b, v, 0); - // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> 
zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_lane_u16_0 return vmlsl_lane_u16(a, b, v, 0); - // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_lane_u32_0 return vmlsl_lane_u32(a, b, v, 0); - // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_laneq_u16_0 return vmlsl_laneq_u16(a, b, v, 0); - // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_laneq_u32_0 return vmlsl_laneq_u32(a, b, v, 0); - // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = 
shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_u16_0 return vmlsl_high_lane_u16(a, b, v, 0); - // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vmlsl_high_lane_u32_0 return vmlsl_high_lane_u32(a, b, v, 0); - // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_u16_0 return vmlsl_high_laneq_u16(a, b, v, 0); - // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> 
[[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vmlsl_high_laneq_u32_0 return vmlsl_high_laneq_u32(a, b, v, 0); - // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vmull_lane_s16_0 return vmull_lane_s16(a, v, 0); - // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vmull_lane_s32_0 return vmull_lane_s32(a, v, 0); - // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) { - // CHECK-LABEL: test_vmull_lane_u16_0 return vmull_lane_u16(a, v, 0); - // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) { - // CHECK-LABEL: test_vmull_lane_u32_0 return 
vmull_lane_u32(a, v, 0); - // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vmull_high_lane_s16_0 return vmull_high_lane_s16(a, v, 0); - // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vmull_high_lane_s32_0 return vmull_high_lane_s32(a, v, 0); - // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) { - // CHECK-LABEL: test_vmull_high_lane_u16_0 return vmull_high_lane_u16(a, v, 0); - // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> 
[[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) { - // CHECK-LABEL: test_vmull_high_lane_u32_0 return vmull_high_lane_u32(a, v, 0); - // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vmull_laneq_s16_0 return vmull_laneq_s16(a, v, 0); - // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vmull_laneq_s32_0 return vmull_laneq_s32(a, v, 0); - // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) { - // CHECK-LABEL: test_vmull_laneq_u16_0 return vmull_laneq_u16(a, v, 0); - // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) { - // CHECK-LABEL: test_vmull_laneq_u32_0 return vmull_laneq_u32(a, v, 0); - // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, 
{{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vmull_high_laneq_s16_0 return vmull_high_laneq_s16(a, v, 0); - // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vmull_high_laneq_s32_0 return vmull_high_laneq_s32(a, v, 0); - // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) { - // CHECK-LABEL: test_vmull_high_laneq_u16_0 return vmull_high_laneq_u16(a, v, 0); - // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t 
test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) { - // CHECK-LABEL: test_vmull_high_laneq_u32_0 return vmull_high_laneq_u32(a, v, 0); - // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlal_lane_s16_0 return vqdmlal_lane_s16(a, b, v, 0); - // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlal_lane_s32_0 return vqdmlal_lane_s32(a, b, v, 0); - // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t 
test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlal_high_lane_s16_0 return vqdmlal_high_lane_s16(a, b, v, 0); - // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlal_high_lane_s32_0 return vqdmlal_high_lane_s32(a, b, v, 0); - // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlsl_lane_s16_0 return vqdmlsl_lane_s16(a, b, v, 0); - // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t 
test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlsl_lane_s32_0 return vqdmlsl_lane_s32(a, b, v, 0); - // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { - // CHECK-LABEL: test_vqdmlsl_high_lane_s16_0 return vqdmlsl_high_lane_s16(a, b, v, 0); - // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { - // CHECK-LABEL: test_vqdmlsl_high_lane_s32_0 return vqdmlsl_high_lane_s32(a, b, v, 0); - // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t 
test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmull_lane_s16_0 return vqdmull_lane_s16(a, v, 0); - // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmull_lane_s32_0 return vqdmull_lane_s32(a, v, 0); - // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmull_laneq_s16_0 return vqdmull_laneq_s16(a, v, 0); - // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmull_laneq_s32_0 return vqdmull_laneq_s32(a, v, 0); - // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmull_high_lane_s16_0 return vqdmull_high_lane_s16(a, v, 0); - // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmull_high_lane_s32_0 return vqdmull_high_lane_s32(a, v, 0); - // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmull_high_laneq_s16_0 return vqdmull_high_laneq_s16(a, v, 0); - // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to 
<2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmull_high_laneq_s32_0 return vqdmull_high_laneq_s32(a, v, 0); - // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmulh_lane_s16_0 return vqdmulh_lane_s16(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vqdmulhq_lane_s16_0 return vqdmulhq_lane_s16(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmulh_lane_s32_0 return vqdmulh_lane_s32(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.2s, 
{{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vqdmulhq_lane_s32_0 return vqdmulhq_lane_s32(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) { - // CHECK-LABEL: test_vqrdmulh_lane_s16_0 return vqrdmulh_lane_s16(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) { - // CHECK-LABEL: test_vqrdmulhq_lane_s16_0 return vqrdmulhq_lane_s16(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x 
i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) { - // CHECK-LABEL: test_vqrdmulh_lane_s32_0 return vqrdmulh_lane_s32(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) { - // CHECK-LABEL: test_vqrdmulhq_lane_s32_0 return vqrdmulhq_lane_s32(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] +// CHECK: ret <2 x float> [[MUL]] float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) { - // CHECK-LABEL: test_vmul_lane_f32_0 return vmul_lane_f32(a, v, 0); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] +// CHECK: ret <4 x float> [[MUL]] float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) { - // CHECK-LABEL: test_vmulq_lane_f32_0 return vmulq_lane_f32(a, v, 0); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] +// CHECK: ret <2 x float> [[MUL]] float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) { - // CHECK-LABEL: test_vmul_laneq_f32_0 return vmul_laneq_f32(a, v, 0); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +// CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]] 
+// CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double> +// CHECK: ret <1 x double> [[TMP5]] float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) { - // CHECK-LABEL: test_vmul_laneq_f64_0 return vmul_laneq_f64(a, v, 0); - // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] +// CHECK: ret <4 x float> [[MUL]] float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) { - // CHECK-LABEL: test_vmulq_laneq_f32_0 return vmulq_laneq_f32(a, v, 0); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]] +// CHECK: ret <2 x double> [[MUL]] float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) { - // CHECK-LABEL: test_vmulq_laneq_f64_0 return vmulq_laneq_f64(a, v, 0); - // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2 +// CHECK: ret <2 x float> [[VMULX2_I]] float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) { - // CHECK-LABEL: test_vmulx_lane_f32_0 return vmulx_lane_f32(a, v, 0); - // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2 +// CHECK: ret <4 x float> [[VMULX2_I]] float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) { - // CHECK-LABEL: test_vmulxq_lane_f32_0 return vmulxq_lane_f32(a, v, 0); - // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMULX2_I:%.*]] = call <2 x double> 
@llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2 +// CHECK: ret <2 x double> [[VMULX2_I]] float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) { - // CHECK-LABEL: test_vmulxq_lane_f64_0 return vmulxq_lane_f64(a, v, 0); - // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2 +// CHECK: ret <2 x float> [[VMULX2_I]] float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) { - // CHECK-LABEL: test_vmulx_laneq_f32_0 return vmulx_laneq_f32(a, v, 0); - // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2 +// CHECK: ret <4 x float> [[VMULX2_I]] float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) { - // CHECK-LABEL: test_vmulxq_laneq_f32_0 return vmulxq_laneq_f32(a, v, 0); - // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2 +// CHECK: ret <2 x double> [[VMULX2_I]] float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) { - // CHECK-LABEL: test_vmulxq_laneq_f64_0 return vmulxq_laneq_f64(a, v, 0); - // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> 
[[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL5_I_I]] int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) { - // CHECK-LABEL: test_vmull_high_n_s16 return vmull_high_n_s16(a, b); - // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL3_I_I]] int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) { - // CHECK-LABEL: test_vmull_high_n_s32 return vmull_high_n_s32(a, b); - // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL5_I_I]] uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) { - // CHECK-LABEL: test_vmull_high_n_u16 return vmull_high_n_u16(a, b); - // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL3_I_I]] 
uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) { - // CHECK-LABEL: test_vmull_high_n_u32 return vmull_high_n_u32(a, b); - // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V4_I_I]]) #2 +// CHECK: [[VQDMULL_V6_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) { - // CHECK-LABEL: test_vqdmull_high_n_s16 return vqdmull_high_n_s16(a, b); - // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V2_I_I]]) #2 +// CHECK: [[VQDMULL_V4_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) { - // CHECK-LABEL: test_vqdmull_high_n_s32 return vqdmull_high_n_s32(a, b); - // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: 
[[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 +// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I_I]] int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { - // CHECK-LABEL: test_vmlal_high_n_s16 return vmlal_high_n_s16(a, b, c); - // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 +// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I_I]] int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { - // CHECK-LABEL: test_vmlal_high_n_s32 return vmlal_high_n_s32(a, b, c); - // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 +// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I_I]] uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { - // CHECK-LABEL: test_vmlal_high_n_u16 return vmlal_high_n_u16(a, b, c); - // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> 
[[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 +// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I_I]] uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { - // CHECK-LABEL: test_vmlal_high_n_u32 return vmlal_high_n_u32(a, b, c); - // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2 +// CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V6_I_I]] int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { - // CHECK-LABEL: test_vqdmlal_high_n_s16 return vqdmlal_high_n_s16(a, b, c); - // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2 +// CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V4_I_I]] int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { - // CHECK-LABEL: test_vqdmlal_high_n_s32 return vqdmlal_high_n_s32(a, b, c); - // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: 
[[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 +// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[SUB_I_I]] int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { - // CHECK-LABEL: test_vmlsl_high_n_s16 return vmlsl_high_n_s16(a, b, c); - // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 +// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[SUB_I_I]] int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { - // CHECK-LABEL: test_vmlsl_high_n_s32 return vmlsl_high_n_s32(a, b, c); - // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 +// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[SUB_I_I]] uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { - // CHECK-LABEL: test_vmlsl_high_n_u16 return vmlsl_high_n_u16(a, b, c); - // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> 
@test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 +// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[SUB_I_I]] uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { - // CHECK-LABEL: test_vmlsl_high_n_u32 return vmlsl_high_n_u32(a, b, c); - // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2 +// CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V6_I_I]] int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { - // CHECK-LABEL: test_vqdmlsl_high_n_s16 return vqdmlsl_high_n_s16(a, b, c); - // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}} } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2 +// CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V4_I_I:%.*]] = 
call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V4_I_I]] int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { - // CHECK-LABEL: test_vqdmlsl_high_n_s32 return vqdmlsl_high_n_s32(a, b, c); - // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}} } +// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1 +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]] +// CHECK: ret <2 x float> [[MUL_I]] float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) { - // CHECK-LABEL: test_vmul_n_f32 return vmul_n_f32(a, b); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3 +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]] +// CHECK: ret <4 x float> [[MUL_I]] float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) { - // CHECK-LABEL: test_vmulq_n_f32 return vmulq_n_f32(a, b); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %b, i32 1 +// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %a, [[VECINIT1_I]] +// CHECK: ret <2 x double> [[MUL_I]] float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) { - // CHECK-LABEL: test_vmulq_n_f64 return vmulq_n_f64(a, b); - // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2 +// CHECK: ret <2 x float> [[TMP6]] float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) { - // CHECK-LABEL: test_vfma_n_f32 return vfma_n_f32(a, b, n); - // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x 
float> [[VECINIT1_I]], float %n, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2 +// CHECK: ret <4 x float> [[TMP6]] float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) { - // CHECK-LABEL: test_vfmaq_n_f32 return vfmaq_n_f32(a, b, n); - // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP4:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, [[TMP3]] +// CHECK: [[FMLS_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[FMLS1_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[FMLS2_I_I:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLS_I_I]], <2 x float> [[TMP4]], <2 x float> [[FMLS1_I_I]]) #2 +// CHECK: ret <2 x float> [[FMLS2_I_I]] float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) { - // CHECK-LABEL: test_vfms_n_f32 return vfms_n_f32(a, b, n); - // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP4:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[TMP3]] +// CHECK: [[FMLS_I_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[FMLS1_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[FMLS2_I_I:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLS_I_I]], <4 x float> [[TMP4]], <4 x float> [[FMLS1_I_I]]) #2 +// CHECK: ret <4 x float> [[FMLS2_I_I]] float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) { - // CHECK-LABEL: test_vfmsq_n_f32 return vfmsq_n_f32(a, b, n); - // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, 
i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i16> [[MUL_I]] int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) { - // CHECK-LABEL: test_vmul_n_s16 return vmul_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]] +// CHECK: ret <8 x i16> [[MUL_I]] int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) { - // CHECK-LABEL: test_vmulq_n_s16 return vmulq_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]] +// CHECK: ret <2 x i32> [[MUL_I]] int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) { - // CHECK-LABEL: test_vmul_n_s32 return vmul_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i32> [[MUL_I]] int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) { - // CHECK-LABEL: test_vmulq_n_s32 return vmulq_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.4s, w0 - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i16> [[MUL_I]] uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) { - // CHECK-LABEL: test_vmul_n_u16 return vmul_n_u16(a, b); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: 
define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]] +// CHECK: ret <8 x i16> [[MUL_I]] uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) { - // CHECK-LABEL: test_vmulq_n_u16 return vmulq_n_u16(a, b); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]] +// CHECK: ret <2 x i32> [[MUL_I]] uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) { - // CHECK-LABEL: test_vmul_n_u32 return vmul_n_u32(a, b); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i32> [[MUL_I]] uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) { - // CHECK-LABEL: test_vmulq_n_u32 return vmulq_n_u32(a, b); - // CHECK: dup {{v[0-9]+}}.4s, w0 - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL5_I]] int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) { - // CHECK-LABEL: test_vmull_n_s16 return vmull_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = 
insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL3_I]] int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) { - // CHECK-LABEL: test_vmull_n_s32 return vmull_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2 +// CHECK: ret <4 x i32> [[VMULL5_I]] uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) { - // CHECK-LABEL: test_vmull_n_u16 return vmull_n_u16(a, b); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2 +// CHECK: ret <2 x i64> [[VMULL3_I]] uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) { - // CHECK-LABEL: test_vmull_n_u32 return vmull_n_u32(a, b); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #2 +// CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> 
[[VQDMULL_V5_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) { - // CHECK-LABEL: test_vqdmull_n_s16 return vqdmull_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #2 +// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) { - // CHECK-LABEL: test_vqdmull_n_s32 return vqdmull_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #2 +// CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) { - // CHECK-LABEL: test_vqdmulh_n_s16 return vqdmulh_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to 
<16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #2 +// CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) { - // CHECK-LABEL: test_vqdmulhq_n_s16 return vqdmulhq_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #2 +// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) { - // CHECK-LABEL: test_vqdmulh_n_s32 return vqdmulh_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #2 +// CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) { - // CHECK-LABEL: test_vqdmulhq_n_s32 return vqdmulhq_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.4s, w0 - // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = 
bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #2 +// CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) { - // CHECK-LABEL: test_vqrdmulh_n_s16 return vqrdmulh_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #2 +// CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) { - // CHECK-LABEL: test_vqrdmulhq_n_s16 return vqrdmulhq_n_s16(a, b); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #2 +// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) { - // CHECK-LABEL: test_vqrdmulh_n_s32 return vqrdmulh_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #2 +// CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) { - // CHECK-LABEL: test_vqrdmulhq_n_s32 return vqrdmulhq_n_s32(a, b); - // CHECK: dup {{v[0-9]+}}.4s, w0 - // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[ADD_I]] int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) { - // CHECK-LABEL: test_vmla_n_s16 return vmla_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { - // CHECK-LABEL: test_vmlaq_n_s16 return vmlaq_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t 
test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) { - // CHECK-LABEL: test_vmla_n_s32 return vmla_n_s32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { - // CHECK-LABEL: test_vmlaq_n_s32 return vmlaq_n_s32(a, b, c); - // CHECK: dup {{v[0-9]+}}.4s, w0 - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { - // CHECK-LABEL: test_vmla_n_u16 return vmla_n_u16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { - // CHECK-LABEL: test_vmlaq_n_u16 return vmlaq_n_u16(a, b, c); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { - // CHECK-LABEL: test_vmla_n_u32 return vmla_n_u32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - 
// CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { - // CHECK-LABEL: test_vmlaq_n_u32 return vmlaq_n_u32(a, b, c); - // CHECK: dup {{v[0-9]+}}.4s, w0 - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { - // CHECK-LABEL: test_vmlal_n_s16 return vmlal_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { - // CHECK-LABEL: test_vmlal_n_s32 return vmlal_n_s32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> 
[[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { - // CHECK-LABEL: test_vmlal_n_u16 return vmlal_n_u16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { - // CHECK-LABEL: test_vmlal_n_u32 return vmlal_n_u32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V6_I]] int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { - // CHECK-LABEL: test_vqdmlal_n_s16 return vqdmlal_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = 
bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V4_I]] int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { - // CHECK-LABEL: test_vqdmlal_n_s32 return vqdmlal_n_s32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[SUB_I]] int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) { - // CHECK-LABEL: test_vmls_n_s16 return vmls_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { - // CHECK-LABEL: test_vmlsq_n_s16 return vmlsq_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) { - // CHECK-LABEL: test_vmls_n_s32 return vmls_n_s32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x 
i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { - // CHECK-LABEL: test_vmlsq_n_s32 return vmlsq_n_s32(a, b, c); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[SUB_I]] uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { - // CHECK-LABEL: test_vmls_n_u16 return vmls_n_u16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { - // CHECK-LABEL: test_vmlsq_n_u16 return vmlsq_n_u16(a, b, c); - // CHECK: dup {{v[0-9]+}}.8h, w0 - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { - // CHECK-LABEL: test_vmls_n_u32 return vmls_n_u32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// 
CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { - // CHECK-LABEL: test_vmlsq_n_u32 return vmlsq_n_u32(a, b, c); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { - // CHECK-LABEL: test_vmlsl_n_s16 return vmlsl_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { - // CHECK-LABEL: test_vmlsl_n_s32 return vmlsl_n_s32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { - // CHECK-LABEL: test_vmlsl_n_u16 return vmlsl_n_u16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: umlsl {{v[0-9]+}}.4s, 
{{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { - // CHECK-LABEL: test_vmlsl_n_u32 return vmlsl_n_u32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V6_I]] int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { - // CHECK-LABEL: test_vqdmlsl_n_s16 return vqdmlsl_n_s16(a, b, c); - // CHECK: dup {{v[0-9]+}}.4h, w0 - // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V4_I]] int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { - // CHECK-LABEL: test_vqdmlsl_n_s32 return vqdmlsl_n_s32(a, b, c); - // CHECK: dup {{v[0-9]+}}.2s, w0 - // 
CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmla_lane_u16_0 return vmla_lane_u16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmlaq_lane_u16_0 return vmlaq_lane_u16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmla_lane_u32_0 return vmla_lane_u32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmlaq_lane_u32_0 return vmlaq_lane_u32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmla_laneq_u16_0 return vmla_laneq_u16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmlaq_laneq_u16_0 return vmlaq_laneq_u16(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = 
shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmla_laneq_u32_0 return vmla_laneq_u32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmlaq_laneq_u32_0 return vmlaq_laneq_u32(a, b, v, 0); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlal_laneq_s16_0 return vqdmlal_laneq_s16(a, b, v, 0); - // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlal_laneq_s32_0 return vqdmlal_laneq_s32(a, b, v, 0); - // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> 
zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlal_high_laneq_s16_0 return vqdmlal_high_laneq_s16(a, b, v, 0); - // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlal_high_laneq_s32_0 return vqdmlal_high_laneq_s32(a, b, v, 0); - // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmls_lane_u16_0 return vmls_lane_u16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmlsq_lane_u16_0 return vmlsq_lane_u16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> 
zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmls_lane_u32_0 return vmls_lane_u32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmlsq_lane_u32_0 return vmlsq_lane_u32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmls_laneq_u16_0 return vmls_laneq_u16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmlsq_laneq_u16_0 return vmlsq_laneq_u16(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmls_laneq_u32_0 return vmls_laneq_u32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmlsq_laneq_u32_0 return vmlsq_laneq_u32(a, b, v, 0); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] 
to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlsl_laneq_s16_0 return vqdmlsl_laneq_s16(a, b, v, 0); - // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlsl_laneq_s32_0 return vqdmlsl_laneq_s32(a, b, v, 0); - // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlsl_high_laneq_s16_0 return vqdmlsl_high_laneq_s16(a, b, v, 0); - // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] =
bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlsl_high_laneq_s32_0 return vqdmlsl_high_laneq_s32(a, b, v, 0); - // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmulh_laneq_s16_0 return vqdmulh_laneq_s16(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmulhq_laneq_s16_0 return vqdmulhq_laneq_s16(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x 
i32> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmulh_laneq_s32_0 return vqdmulh_laneq_s32(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmulhq_laneq_s32_0 return vqdmulhq_laneq_s32(a, v, 0); - // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vqrdmulh_laneq_s16_0 return vqrdmulh_laneq_s16(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vqrdmulhq_laneq_s16_0 return vqrdmulhq_laneq_s16(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] } +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = 
shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vqrdmulh_laneq_s32_0 return vqrdmulh_laneq_s32(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vqrdmulhq_laneq_s32_0 return vqrdmulhq_laneq_s32(a, v, 0); - // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmla_lane_u16 return vmla_lane_u16(a, b, v, 3); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmlaq_lane_u16 return vmlaq_lane_u16(a, b, v, 3); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmla_lane_u32 return vmla_lane_u32(a, b, v, 1); - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define
<4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmlaq_lane_u32 return vmlaq_lane_u32(a, b, v, 1); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmla_laneq_u16 return vmla_laneq_u16(a, b, v, 7); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmlaq_laneq_u16 return vmlaq_laneq_u16(a, b, v, 7); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmla_laneq_u32 return vmla_laneq_u32(a, b, v, 3); - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmlaq_laneq_u32 return vmlaq_laneq_u32(a, b, v, 3); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32>
[[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlal_laneq_s16 return vqdmlal_laneq_s16(a, b, v, 7); - // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlal_laneq_s32 return vqdmlal_laneq_s32(a, b, v, 3); - // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlal_high_laneq_s16 return vqdmlal_high_laneq_s16(a, b, v, 7); - // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]])
#2 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlal_high_laneq_s32 return vqdmlal_high_laneq_s32(a, b, v, 3); - // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmls_lane_u16 return vmls_lane_u16(a, b, v, 3); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) { - // CHECK-LABEL: test_vmlsq_lane_u16 return vmlsq_lane_u16(a, b, v, 3); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] } +// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmls_lane_u32 return vmls_lane_u32(a, b, v, 1); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) { - // CHECK-LABEL: test_vmlsq_lane_u32 return vmlsq_lane_u32(a, b, v, 1); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmls_laneq_u16 return vmls_laneq_u16(a, b, v, 7); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) { - // CHECK-LABEL: test_vmlsq_laneq_u16 return vmlsq_laneq_u16(a, b, v, 7); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i32>
@test_vmls_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmls_laneq_u32 return vmls_laneq_u32(a, b, v, 3); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) { - // CHECK-LABEL: test_vmlsq_laneq_u32 return vmlsq_laneq_u32(a, b, v, 3); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlsl_laneq_s16 return vqdmlsl_laneq_s16(a, b, v, 7); - // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlsl_laneq_s32 return vqdmlsl_laneq_s32(a, b, v, 3); - // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v,
<4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { - // CHECK-LABEL: test_vqdmlsl_high_laneq_s16 return vqdmlsl_high_laneq_s16(a, b, v, 7); - // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { - // CHECK-LABEL: test_vqdmlsl_high_laneq_s32 return vqdmlsl_high_laneq_s32(a, b, v, 3); - // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmulh_laneq_s16 return vqdmulh_laneq_s16(a, v, 7); - // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]]
to <8 x i16> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vqdmulhq_laneq_s16 return vqdmulhq_laneq_s16(a, v, 7); - // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmulh_laneq_s32 return vqdmulh_laneq_s32(a, v, 3); - // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vqdmulhq_laneq_s32 return vqdmulhq_laneq_s32(a, v, 3); - // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) { - // CHECK-LABEL: test_vqrdmulh_laneq_s16 return vqrdmulh_laneq_s16(a, v, 7); - // CHECK:
sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) { - // CHECK-LABEL: test_vqrdmulhq_laneq_s16 return vqrdmulhq_laneq_s16(a, v, 7); - // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] } +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) { - // CHECK-LABEL: test_vqrdmulh_laneq_s32 return vqrdmulh_laneq_s32(a, v, 3); - // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) { - // CHECK-LABEL: test_vqrdmulhq_laneq_s32 return vqrdmulhq_laneq_s32(a, v, 3); - // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } Index: test/CodeGen/aarch64-neon-3v.c =================================================================== --- test/CodeGen/aarch64-neon-3v.c +++ test/CodeGen/aarch64-neon-3v.c @@ -1,486 +1,597 @@ -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s // Test new
aarch64 intrinsics and types #include <arm_neon.h> +// CHECK-LABEL: define <8 x i8> @test_vand_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[AND_I]] int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vand_s8 return vand_s8(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vandq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[AND_I]] int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vandq_s8 return vandq_s8(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vand_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[AND_I]] int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vand_s16 return vand_s16(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[AND_I]] int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vandq_s16 return vandq_s16(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vand_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[AND_I]] int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vand_s32 return vand_s32(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vandq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[AND_I]] int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vandq_s32 return vandq_s32(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vand_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[AND_I]] int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vand_s64 return vand_s64(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vandq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[AND_I]] int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vandq_s64 return vandq_s64(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vand_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[AND_I]] uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vand_u8 return vand_u8(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vandq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[AND_I]] uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vandq_u8 return vandq_u8(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vand_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b +//
CHECK: ret <4 x i16> [[AND_I]] uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vand_u16 return vand_u16(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vandq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[AND_I]] uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vandq_u16 return vandq_u16(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vand_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[AND_I]] uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vand_u32 return vand_u32(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[AND_I]] uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vandq_u32 return vandq_u32(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vand_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[AND_I]] uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vand_u64 return vand_u64(a, b); - // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vandq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[AND_I]] uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vandq_u64 return vandq_u64(a, b); - // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vorr_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[OR_I]] int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vorr_s8 return vorr_s8(a, b); - // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[OR_I]] int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vorrq_s8 return vorrq_s8(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vorr_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[OR_I]] int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vorr_s16 return vorr_s16(a, b); - // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[OR_I]] int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vorrq_s16 return vorrq_s16(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vorr_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[OR_I]] int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vorr_s32 return vorr_s32(a, b); - // CHECK: orr 
{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[OR_I]] int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vorrq_s32 return vorrq_s32(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vorr_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[OR_I]] int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vorr_s64 return vorr_s64(a, b); - // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[OR_I]] int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vorrq_s64 return vorrq_s64(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vorr_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[OR_I]] uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vorr_u8 return vorr_u8(a, b); - // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[OR_I]] uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vorrq_u8 return vorrq_u8(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vorr_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[OR_I]] uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vorr_u16 return vorr_u16(a, b); - // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[OR_I]] uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vorrq_u16 return vorrq_u16(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vorr_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[OR_I]] uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vorr_u32 return vorr_u32(a, b); - // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[OR_I]] uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vorrq_u32 return vorrq_u32(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vorr_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[OR_I]] uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vorr_u64 return vorr_u64(a, b); - // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, 
%b +// CHECK: ret <2 x i64> [[OR_I]] uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vorrq_u64 return vorrq_u64(a, b); - // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_veor_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[XOR_I]] int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_veor_s8 return veor_s8(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_veorq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[XOR_I]] int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_veorq_s8 return veorq_s8(a, b); - // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_veor_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[XOR_I]] int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_veor_s16 return veor_s16(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[XOR_I]] int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_veorq_s16 return veorq_s16(a, b); - // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_veor_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[XOR_I]] int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_veor_s32 return veor_s32(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_veorq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[XOR_I]] int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_veorq_s32 return veorq_s32(a, b); - // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_veor_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[XOR_I]] int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_veor_s64 return veor_s64(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_veorq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[XOR_I]] int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_veorq_s64 return veorq_s64(a, b); - // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_veor_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[XOR_I]] uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_veor_u8 return veor_u8(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_veorq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[XOR_I]] uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_veorq_u8 return veorq_u8(a, b); - // CHECK: eor 
{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_veor_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[XOR_I]] uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_veor_u16 return veor_u16(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_veorq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[XOR_I]] uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_veorq_u16 return veorq_u16(a, b); - // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_veor_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[XOR_I]] uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_veor_u32 return veor_u32(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[XOR_I]] uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_veorq_u32 return veorq_u32(a, b); - // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_veor_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[XOR_I]] uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_veor_u64 return veor_u64(a, b); - // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_veorq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[XOR_I]] uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_veorq_u64 return veorq_u64(a, b); - // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vbic_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[AND_I]] int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vbic_s8 return vbic_s8(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[AND_I]] int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vbicq_s8 return vbicq_s8(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vbic_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[AND_I]] int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vbic_s16 return vbic_s16(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[AND_I]] int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) { - // 
CHECK-LABEL: test_vbicq_s16 return vbicq_s16(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vbic_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]] +// CHECK: ret <2 x i32> [[AND_I]] int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vbic_s32 return vbic_s32(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[AND_I]] int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vbicq_s32 return vbicq_s32(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vbic_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[AND_I]] int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vbic_s64 return vbic_s64(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]] +// CHECK: ret <2 x i64> [[AND_I]] int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vbicq_s64 return vbicq_s64(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vbic_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[AND_I]] uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vbic_u8 return vbic_u8(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vbicq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[AND_I]] uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vbicq_u8 return vbicq_u8(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vbic_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[AND_I]] uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vbic_u16 return vbic_u16(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[AND_I]] uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vbicq_u16 return vbicq_u16(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vbic_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]] +// CHECK: ret <2 x i32> [[AND_I]] uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: 
test_vbic_u32 return vbic_u32(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[AND_I]] uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vbicq_u32 return vbicq_u32(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vbic_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[AND_I]] uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vbic_u64 return vbic_u64(a, b); - // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]] +// CHECK: ret <2 x i64> [[AND_I]] uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vbicq_u64 return vbicq_u64(a, b); - // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vorn_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[OR_I]] int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vorn_s8 return vorn_s8(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vornq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[OR_I]] int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vornq_s8 return vornq_s8(a, b); - // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vorn_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[OR_I]] int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vorn_s16 return vorn_s16(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[OR_I]] int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vornq_s16 return vornq_s16(a, b); - // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vorn_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]] +// CHECK: ret <2 x i32> [[OR_I]] int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vorn_s32 return vorn_s32(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vornq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[OR_I]] int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vornq_s32 return vornq_s32(a, b); - // 
CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vorn_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[OR_I]] int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vorn_s64 return vorn_s64(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vornq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, +// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]] +// CHECK: ret <2 x i64> [[OR_I]] int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vornq_s64 return vornq_s64(a, b); - // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vorn_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[OR_I]] uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vorn_u8 return vorn_u8(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vornq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[OR_I]] uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vornq_u8 return vornq_u8(a, b); - // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vorn_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[OR_I]] uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vorn_u16 return vorn_u16(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vornq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[OR_I]] uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vornq_u16 return vornq_u16(a, b); - // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i32> @test_vorn_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]] +// CHECK: ret <2 x i32> [[OR_I]] uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vorn_u32 return vorn_u32(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[OR_I]] uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vornq_u32 return vornq_u32(a, b); - // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <1 x i64> @test_vorn_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[OR_I]] uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vorn_u64 return vorn_u64(a, b); - // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, 
{{v[0-9]+}}.8b
 }

+// CHECK-LABEL: define <2 x i64> @test_vornq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
+// CHECK: ret <2 x i64> [[OR_I]]
 uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vornq_u64
   return vornq_u64(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
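// The vbic/vorn checks above rely on there being no dedicated IR operation
// for "AND/OR with complement": Clang emits a NOT (xor against an all-ones
// splat) followed by the plain bitwise op, and the AArch64 backend is
// expected to fold the pair back into a single BIC/ORN instruction. A
// minimal plain-C sketch of the same pattern (bic_by_hand is a hypothetical
// helper, not part of the test suite):

#include <arm_neon.h>

// a & ~b, spelled the way the IR checks above match it
int8x8_t bic_by_hand(int8x8_t a, int8x8_t b) {
  int8x8_t not_b = veor_s8(b, vdup_n_s8(-1)); // xor <8 x i8> %b, <i8 -1, ...>
  return vand_s8(a, not_b);                   // and <8 x i8> %a, [[NEG_I]]
}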
Index: test/CodeGen/aarch64-neon-across.c
===================================================================
--- test/CodeGen/aarch64-neon-across.c
+++ test/CodeGen/aarch64-neon-across.c
@@ -1,271 +1,398 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

 // Test new aarch64 intrinsics and types

 #include <arm_neon.h>

+// CHECK-LABEL: define i16 @test_vaddlv_s8(<8 x i8> %a) #0 {
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a) #2
+// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK: ret i16 [[TMP0]]
 int16_t test_vaddlv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vaddlv_s8
   return vaddlv_s8(a);
-  // CHECK: saddlv {{h[0-9]+}}, {{v[0-9]+}}.8b
 }

+// CHECK-LABEL: define i32 @test_vaddlv_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: ret i32 [[VADDLV_I]]
 int32_t test_vaddlv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vaddlv_s16
   return vaddlv_s16(a);
-  // CHECK: saddlv {{s[0-9]+}}, {{v[0-9]+}}.4h
 }

+// CHECK-LABEL: define i16 @test_vaddlv_u8(<8 x i8> %a) #0 {
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a) #2
+// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK: ret i16 [[TMP0]]
 uint16_t test_vaddlv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vaddlv_u8
   return vaddlv_u8(a);
-  // CHECK: uaddlv {{h[0-9]+}}, {{v[0-9]+}}.8b
 }

+// CHECK-LABEL: define i32 @test_vaddlv_u16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: ret i32 [[VADDLV_I]]
 uint32_t test_vaddlv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vaddlv_u16
   return vaddlv_u16(a);
-  // CHECK: uaddlv {{s[0-9]+}}, {{v[0-9]+}}.4h
 }

+// CHECK-LABEL: define i16 @test_vaddlvq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a) #2
+// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK: ret i16 [[TMP0]]
 int16_t test_vaddlvq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vaddlvq_s8
   return vaddlvq_s8(a);
-  // CHECK: saddlv {{h[0-9]+}}, {{v[0-9]+}}.16b
 }

+// CHECK-LABEL: define i32 @test_vaddlvq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK: ret i32 [[VADDLV_I]]
 int32_t test_vaddlvq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vaddlvq_s16
   return vaddlvq_s16(a);
-  // CHECK: saddlv {{s[0-9]+}}, {{v[0-9]+}}.8h
 }

+// CHECK-LABEL: define i64 @test_vaddlvq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK: ret i64 [[VADDLVQ_S32_I]]
 int64_t test_vaddlvq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vaddlvq_s32
   return vaddlvq_s32(a);
-  // CHECK: saddlv {{d[0-9]+}}, {{v[0-9]+}}.4s
 }

+// CHECK-LABEL: define i16 @test_vaddlvq_u8(<16 x i8> %a) #0 {
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a) #2
+// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK: ret i16 [[TMP0]]
 uint16_t test_vaddlvq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vaddlvq_u8
   return vaddlvq_u8(a);
-  // CHECK: uaddlv {{h[0-9]+}}, {{v[0-9]+}}.16b
 }

+// CHECK-LABEL: define i32 @test_vaddlvq_u16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK: ret i32 [[VADDLV_I]]
 uint32_t test_vaddlvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vaddlvq_u16
   return vaddlvq_u16(a);
-  // CHECK: uaddlv {{s[0-9]+}}, {{v[0-9]+}}.8h
 }

+// CHECK-LABEL: define i64 @test_vaddlvq_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK: ret i64 [[VADDLVQ_U32_I]]
 uint64_t test_vaddlvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vaddlvq_u32
   return vaddlvq_u32(a);
-  // CHECK: uaddlv {{d[0-9]+}}, {{v[0-9]+}}.4s
 }

+// CHECK-LABEL: define i8 @test_vmaxv_s8(<8 x i8> %a) #0 {
+// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> %a) #2
+// CHECK: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK: ret i8 [[TMP0]]
 int8_t test_vmaxv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vmaxv_s8
   return vmaxv_s8(a);
-  // CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }

+// CHECK-LABEL: define i16 @test_vmaxv_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK: ret i16 [[TMP2]]
 int16_t test_vmaxv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vmaxv_s16
   return vmaxv_s16(a);
-  // CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }

+// CHECK-LABEL: define i8 @test_vmaxv_u8(<8 x i8> %a) #0 {
+// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a) #2
+// CHECK: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK: ret i8 [[TMP0]]
 uint8_t test_vmaxv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vmaxv_u8
   return vmaxv_u8(a);
-  // CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }

+// CHECK-LABEL: define i16 @test_vmaxv_u16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK: ret i16 [[TMP2]]
 uint16_t test_vmaxv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vmaxv_u16
   return vmaxv_u16(a);
-  // CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }

+// CHECK-LABEL: define i8 @test_vmaxvq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VMAXV_I:%.*]] = call i32
@llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8 +// CHECK: ret i8 [[TMP0]] int8_t test_vmaxvq_s8(int8x16_t a) { - // CHECK-LABEL: test_vmaxvq_s8 return vmaxvq_s8(a); - // CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i16 @test_vmaxvq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16 +// CHECK: ret i16 [[TMP2]] int16_t test_vmaxvq_s16(int16x8_t a) { - // CHECK-LABEL: test_vmaxvq_s16 return vmaxvq_s16(a); - // CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h } +// CHECK-LABEL: define i32 @test_vmaxvq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> [[TMP1]]) #2 +// CHECK: ret i32 [[VMAXVQ_S32_I]] int32_t test_vmaxvq_s32(int32x4_t a) { - // CHECK-LABEL: test_vmaxvq_s32 return vmaxvq_s32(a); - // CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s } +// CHECK-LABEL: define i8 @test_vmaxvq_u8(<16 x i8> %a) #0 { +// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8 +// CHECK: ret i8 [[TMP0]] uint8_t test_vmaxvq_u8(uint8x16_t a) { - // CHECK-LABEL: test_vmaxvq_u8 return vmaxvq_u8(a); - // CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i16 @test_vmaxvq_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16 +// CHECK: ret i16 [[TMP2]] uint16_t test_vmaxvq_u16(uint16x8_t a) { - // CHECK-LABEL: test_vmaxvq_u16 return vmaxvq_u16(a); - // CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h } +// CHECK-LABEL: define i32 @test_vmaxvq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> [[TMP1]]) #2 +// CHECK: ret i32 [[VMAXVQ_U32_I]] uint32_t test_vmaxvq_u32(uint32x4_t a) { - // CHECK-LABEL: test_vmaxvq_u32 return vmaxvq_u32(a); - // CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s } +// CHECK-LABEL: define i8 @test_vminv_s8(<8 x i8> %a) #0 { +// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 +// CHECK: ret i8 [[TMP0]] int8_t test_vminv_s8(int8x8_t a) { - // CHECK-LABEL: test_vminv_s8 return vminv_s8(a); - // CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define i16 @test_vminv_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16 +// CHECK: ret i16 [[TMP2]] int16_t test_vminv_s16(int16x4_t a) { - // CHECK-LABEL: test_vminv_s16 return vminv_s16(a); - // CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.4h } +// CHECK-LABEL: define i8 @test_vminv_u8(<8 x i8> %a) #0 { +// 
CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 +// CHECK: ret i8 [[TMP0]] uint8_t test_vminv_u8(uint8x8_t a) { - // CHECK-LABEL: test_vminv_u8 return vminv_u8(a); - // CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define i16 @test_vminv_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16 +// CHECK: ret i16 [[TMP2]] uint16_t test_vminv_u16(uint16x4_t a) { - // CHECK-LABEL: test_vminv_u16 return vminv_u16(a); - // CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.4h } +// CHECK-LABEL: define i8 @test_vminvq_s8(<16 x i8> %a) #0 { +// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 +// CHECK: ret i8 [[TMP0]] int8_t test_vminvq_s8(int8x16_t a) { - // CHECK-LABEL: test_vminvq_s8 return vminvq_s8(a); - // CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i16 @test_vminvq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16 +// CHECK: ret i16 [[TMP2]] int16_t test_vminvq_s16(int16x8_t a) { - // CHECK-LABEL: test_vminvq_s16 return vminvq_s16(a); - // CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h } +// CHECK-LABEL: define i32 @test_vminvq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> [[TMP1]]) #2 +// CHECK: ret i32 [[VMINVQ_S32_I]] int32_t test_vminvq_s32(int32x4_t a) { - // CHECK-LABEL: test_vminvq_s32 return vminvq_s32(a); - // CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s } +// CHECK-LABEL: define i8 @test_vminvq_u8(<16 x i8> %a) #0 { +// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 +// CHECK: ret i8 [[TMP0]] uint8_t test_vminvq_u8(uint8x16_t a) { - // CHECK-LABEL: test_vminvq_u8 return vminvq_u8(a); - // CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i16 @test_vminvq_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16 +// CHECK: ret i16 [[TMP2]] uint16_t test_vminvq_u16(uint16x8_t a) { - // CHECK-LABEL: test_vminvq_u16 return vminvq_u16(a); - // CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h } +// CHECK-LABEL: define i32 @test_vminvq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> [[TMP1]]) #2 +// CHECK: ret i32 [[VMINVQ_U32_I]] uint32_t test_vminvq_u32(uint32x4_t a) { - // CHECK-LABEL: test_vminvq_u32 return vminvq_u32(a); - // CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s } +// CHECK-LABEL: define i8 
@test_vaddv_s8(<8 x i8> %a) #0 { +// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 +// CHECK: ret i8 [[TMP0]] int8_t test_vaddv_s8(int8x8_t a) { - // CHECK-LABEL: test_vaddv_s8 return vaddv_s8(a); - // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define i16 @test_vaddv_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16 +// CHECK: ret i16 [[TMP2]] int16_t test_vaddv_s16(int16x4_t a) { - // CHECK-LABEL: test_vaddv_s16 return vaddv_s16(a); - // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.4h } +// CHECK-LABEL: define i8 @test_vaddv_u8(<8 x i8> %a) #0 { +// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 +// CHECK: ret i8 [[TMP0]] uint8_t test_vaddv_u8(uint8x8_t a) { - // CHECK-LABEL: test_vaddv_u8 return vaddv_u8(a); - // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define i16 @test_vaddv_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16 +// CHECK: ret i16 [[TMP2]] uint16_t test_vaddv_u16(uint16x4_t a) { - // CHECK-LABEL: test_vaddv_u16 return vaddv_u16(a); - // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.4h } +// CHECK-LABEL: define i8 @test_vaddvq_s8(<16 x i8> %a) #0 { +// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 +// CHECK: ret i8 [[TMP0]] int8_t test_vaddvq_s8(int8x16_t a) { - // CHECK-LABEL: test_vaddvq_s8 return vaddvq_s8(a); - // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i16 @test_vaddvq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[TMP1]]) #2 +// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16 +// CHECK: ret i16 [[TMP2]] int16_t test_vaddvq_s16(int16x8_t a) { - // CHECK-LABEL: test_vaddvq_s16 return vaddvq_s16(a); - // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h } +// CHECK-LABEL: define i32 @test_vaddvq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[TMP1]]) #2 +// CHECK: ret i32 [[VADDVQ_S32_I]] int32_t test_vaddvq_s32(int32x4_t a) { - // CHECK-LABEL: test_vaddvq_s32 return vaddvq_s32(a); - // CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s } +// CHECK-LABEL: define i8 @test_vaddvq_u8(<16 x i8> %a) #0 { +// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a) #2 +// CHECK: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 +// CHECK: ret i8 [[TMP0]] uint8_t test_vaddvq_u8(uint8x16_t a) { - // CHECK-LABEL: test_vaddvq_u8 return vaddvq_u8(a); - // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i16 @test_vaddvq_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to 
<16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK: [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK: ret i16 [[TMP2]]
 uint16_t test_vaddvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vaddvq_u16
   return vaddvq_u16(a);
-  // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }

+// CHECK-LABEL: define i32 @test_vaddvq_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK: ret i32 [[VADDVQ_U32_I]]
 uint32_t test_vaddvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vaddvq_u32
   return vaddvq_u32(a);
-  // CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }

+// CHECK-LABEL: define float @test_vmaxvq_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK: ret float [[VMAXVQ_F32_I]]
 float32_t test_vmaxvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vmaxvq_f32
   return vmaxvq_f32(a);
-  // CHECK: fmaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }

+// CHECK-LABEL: define float @test_vminvq_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK: ret float [[VMINVQ_F32_I]]
 float32_t test_vminvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vminvq_f32
   return vminvq_f32(a);
-  // CHECK: fminv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }

+// CHECK-LABEL: define float @test_vmaxnmvq_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK: ret float [[VMAXNMVQ_F32_I]]
 float32_t test_vmaxnmvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vmaxnmvq_f32
   return vmaxnmvq_f32(a);
-  // CHECK: fmaxnmv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }

+// CHECK-LABEL: define float @test_vminnmvq_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK: ret float [[VMINNMVQ_F32_I]]
 float32_t test_vminnmvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vminnmvq_f32
   return vminnmvq_f32(a);
-  // CHECK: fminnmv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
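// Every across-lanes test above has the same shape: the intrinsic becomes a
// call to an llvm.aarch64.neon.*v reduction returning i32 (i64 or float for
// the wide and floating-point forms), and results narrower than i32 are cut
// down with a trunc. Scalar reference semantics for vaddlv_s8, as a sketch
// (addlv_s8_reference is a hypothetical helper, not part of the tests):

#include <arm_neon.h>

int16_t addlv_s8_reference(int8x8_t a) {
  int8_t lanes[8];
  vst1_s8(lanes, a);       // spill the lanes so we can index them
  int32_t sum = 0;         // i32 accumulator, like the intrinsic's result
  for (int i = 0; i < 8; ++i)
    sum += lanes[i];       // each i8 lane widens before the add
  return (int16_t)sum;     // the trunc that the CHECK lines match
}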
Index: test/CodeGen/aarch64-neon-extract.c
===================================================================
--- test/CodeGen/aarch64-neon-extract.c
+++ test/CodeGen/aarch64-neon-extract.c
@@ -1,148 +1,247 @@
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

 // Test new aarch64 intrinsics and types

 #include <arm_neon.h>

+// CHECK-LABEL: define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+// CHECK: ret <8 x i8> [[VEXT]]
 int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vext_s8
   return vext_s8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
 }

+// CHECK-LABEL: define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK: ret <4 x i16> [[VEXT]]
 int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vext_s16
   return vext_s16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
 }

+// CHECK-LABEL: define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK: ret <2 x i32> [[VEXT]]
 int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vext_s32
   return vext_s32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
 }

+// CHECK-LABEL: define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK: ret <1 x i64> [[VEXT]]
 int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vext_s64
   return vext_s64(a, b, 0);
 }

+// CHECK-LABEL: define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+// CHECK: ret <16 x i8> [[VEXT]]
 int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vextq_s8
   return vextq_s8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
 }

+// CHECK-LABEL: define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+// CHECK: ret <8 x i16> [[VEXT]]
 int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vextq_s16
   return vextq_s16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
 }

+// CHECK-LABEL: define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+// CHECK: ret <4 x i32> [[VEXT]]
 int32x4_t
test_vextq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vextq_s32 return vextq_s32(a, b, 1); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}} } +// CHECK-LABEL: define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> +// CHECK: ret <2 x i64> [[VEXT]] int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vextq_s64 return vextq_s64(a, b, 1); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}} } +// CHECK-LABEL: define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[VEXT]] uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vext_u8 return vext_u8(a, b, 2); - // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}} } +// CHECK-LABEL: define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i16> [[VEXT]] uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vext_u16 return vext_u16(a, b, 3); - // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}} } +// CHECK-LABEL: define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK: ret <2 x i32> [[VEXT]] uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vext_u32 return vext_u32(a, b, 1); - // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}} } +// CHECK-LABEL: define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[VEXT]] uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vext_u64 return vext_u64(a, b, 0); } +// CHECK-LABEL: define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[VEXT]] uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vextq_u8 return vextq_u8(a, b, 2); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}} } +// CHECK-LABEL: define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) #0 
{ +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK: ret <8 x i16> [[VEXT]] uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vextq_u16 return vextq_u16(a, b, 3); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}} } +// CHECK-LABEL: define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i32> [[VEXT]] uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vextq_u32 return vextq_u32(a, b, 1); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}} } +// CHECK-LABEL: define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> +// CHECK: ret <2 x i64> [[VEXT]] uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vextq_u64 return vextq_u64(a, b, 1); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}} } +// CHECK-LABEL: define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> +// CHECK: ret <2 x float> [[VEXT]] float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vext_f32 return vext_f32(a, b, 1); - // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}} } +// CHECK-LABEL: define <1 x double> @test_vext_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VEXT:%.*]] = shufflevector <1 x double> [[TMP2]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer +// CHECK: ret <1 x double> [[VEXT]] float64x1_t test_vext_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vext_f64 return vext_f64(a, b, 0); } +// CHECK-LABEL: define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VEXT:%.*]] = 
shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +// CHECK: ret <4 x float> [[VEXT]] float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vextq_f32 return vextq_f32(a, b, 1); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}} } +// CHECK-LABEL: define <2 x double> @test_vextq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> +// CHECK: ret <2 x double> [[VEXT]] float64x2_t test_vextq_f64(float64x2_t a, float64x2_t b) { - // CHECK-LABEL: test_vextq_f64 return vextq_f64(a, b, 1); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}} } +// CHECK-LABEL: define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[VEXT]] poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vext_p8 return vext_p8(a, b, 2); - // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}} } +// CHECK-LABEL: define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i16> [[VEXT]] poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vext_p16 return vext_p16(a, b, 3); - // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}} } +// CHECK-LABEL: define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[VEXT]] poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vextq_p8 return vextq_p8(a, b, 2); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}} } +// CHECK-LABEL: define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK: ret <8 x i16> [[VEXT]] poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vextq_p16 return vextq_p16(a, b, 3); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}} } Index: test/CodeGen/aarch64-neon-fcvt-intrinsics.c =================================================================== --- test/CodeGen/aarch64-neon-fcvt-intrinsics.c +++ test/CodeGen/aarch64-neon-fcvt-intrinsics.c @@ -1,133 +1,153 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s // Test new aarch64 intrinsics 
and types #include +// CHECK-LABEL: define float @test_vcvtxd_f32_f64(double %a) #0 { +// CHECK: [[VCVTXD_F32_F64_I:%.*]] = call float @llvm.aarch64.sisd.fcvtxn(double %a) #2 +// CHECK: ret float [[VCVTXD_F32_F64_I]] float32_t test_vcvtxd_f32_f64(float64_t a) { -// CHECK-LABEL: test_vcvtxd_f32_f64 -// CHECK: fcvtxn {{s[0-9]+}}, {{d[0-9]+}} return (float32_t)vcvtxd_f32_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtas_s32_f32(float %a) #0 { +// CHECK: [[VCVTAS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTAS_S32_F32_I]] int32_t test_vcvtas_s32_f32(float32_t a) { -// CHECK-LABEL: test_vcvtas_s32_f32 -// CHECK: fcvtas {{[ws][0-9]+}}, {{s[0-9]+}} return (int32_t)vcvtas_s32_f32(a); } +// CHECK-LABEL: define i64 @test_test_vcvtad_s64_f64(double %a) #0 { +// CHECK: [[VCVTAD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTAD_S64_F64_I]] int64_t test_test_vcvtad_s64_f64(float64_t a) { -// CHECK-LABEL: test_test_vcvtad_s64_f64 -// CHECK: fcvtas {{[dx][0-9]+}}, {{d[0-9]+}} return (int64_t)vcvtad_s64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtas_u32_f32(float %a) #0 { +// CHECK: [[VCVTAS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTAS_U32_F32_I]] uint32_t test_vcvtas_u32_f32(float32_t a) { -// CHECK-LABEL: test_vcvtas_u32_f32 -// CHECK: fcvtau {{[ws][0-9]+}}, {{s[0-9]+}} return (uint32_t)vcvtas_u32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtad_u64_f64(double %a) #0 { +// CHECK: [[VCVTAD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTAD_U64_F64_I]] uint64_t test_vcvtad_u64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtad_u64_f64 -// CHECK: fcvtau {{[xd][0-9]+}}, {{d[0-9]+}} return (uint64_t)vcvtad_u64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtms_s32_f32(float %a) #0 { +// CHECK: [[VCVTMS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTMS_S32_F32_I]] int32_t test_vcvtms_s32_f32(float32_t a) { -// CHECK-LABEL: test_vcvtms_s32_f32 -// CHECK: fcvtms {{[sw][0-9]+}}, {{s[0-9]+}} return (int32_t)vcvtms_s32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtmd_s64_f64(double %a) #0 { +// CHECK: [[VCVTMD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTMD_S64_F64_I]] int64_t test_vcvtmd_s64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtmd_s64_f64 -// CHECK: fcvtms {{[dx][0-9]+}}, {{d[0-9]+}} return (int64_t)vcvtmd_s64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtms_u32_f32(float %a) #0 { +// CHECK: [[VCVTMS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTMS_U32_F32_I]] uint32_t test_vcvtms_u32_f32(float32_t a) { -// CHECK-LABEL: test_vcvtms_u32_f32 -// CHECK: fcvtmu {{[ws][0-9]+}}, {{s[0-9]+}} return (uint32_t)vcvtms_u32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtmd_u64_f64(double %a) #0 { +// CHECK: [[VCVTMD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTMD_U64_F64_I]] uint64_t test_vcvtmd_u64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtmd_u64_f64 -// CHECK: fcvtmu {{[xd][0-9]+}}, {{d[0-9]+}} return (uint64_t)vcvtmd_u64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtns_s32_f32(float %a) #0 { +// CHECK: [[VCVTNS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTNS_S32_F32_I]] int32_t test_vcvtns_s32_f32(float32_t a) { -// 
CHECK-LABEL: test_vcvtns_s32_f32 -// CHECK: fcvtns {{[sw][0-9]+}}, {{s[0-9]+}} return (int32_t)vcvtns_s32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtnd_s64_f64(double %a) #0 { +// CHECK: [[VCVTND_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTND_S64_F64_I]] int64_t test_vcvtnd_s64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtnd_s64_f64 -// CHECK: fcvtns {{[dx][0-9]+}}, {{d[0-9]+}} return (int64_t)vcvtnd_s64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtns_u32_f32(float %a) #0 { +// CHECK: [[VCVTNS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTNS_U32_F32_I]] uint32_t test_vcvtns_u32_f32(float32_t a) { -// CHECK-LABEL: test_vcvtns_u32_f32 -// CHECK: fcvtnu {{[sw][0-9]+}}, {{s[0-9]+}} return (uint32_t)vcvtns_u32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtnd_u64_f64(double %a) #0 { +// CHECK: [[VCVTND_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTND_U64_F64_I]] uint64_t test_vcvtnd_u64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtnd_u64_f64 -// CHECK: fcvtnu {{[dx][0-9]+}}, {{d[0-9]+}} return (uint64_t)vcvtnd_u64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtps_s32_f32(float %a) #0 { +// CHECK: [[VCVTPS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTPS_S32_F32_I]] int32_t test_vcvtps_s32_f32(float32_t a) { -// CHECK-LABEL: test_vcvtps_s32_f32 -// CHECK: fcvtps {{[sw][0-9]+}}, {{s[0-9]+}} return (int32_t)vcvtps_s32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtpd_s64_f64(double %a) #0 { +// CHECK: [[VCVTPD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTPD_S64_F64_I]] int64_t test_vcvtpd_s64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtpd_s64_f64 -// CHECK: fcvtps {{[dx][0-9]+}}, {{d[0-9]+}} return (int64_t)vcvtpd_s64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvtps_u32_f32(float %a) #0 { +// CHECK: [[VCVTPS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %a) #2 +// CHECK: ret i32 [[VCVTPS_U32_F32_I]] uint32_t test_vcvtps_u32_f32(float32_t a) { -// CHECK-LABEL: test_vcvtps_u32_f32 -// CHECK: fcvtpu {{[sw][0-9]+}}, {{s[0-9]+}} return (uint32_t)vcvtps_u32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtpd_u64_f64(double %a) #0 { +// CHECK: [[VCVTPD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %a) #2 +// CHECK: ret i64 [[VCVTPD_U64_F64_I]] uint64_t test_vcvtpd_u64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtpd_u64_f64 -// CHECK: fcvtpu {{[dx][0-9]+}}, {{d[0-9]+}} return (uint64_t)vcvtpd_u64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvts_s32_f32(float %a) #0 { +// CHECK: [[TMP0:%.*]] = fptosi float %a to i32 +// CHECK: ret i32 [[TMP0]] int32_t test_vcvts_s32_f32(float32_t a) { -// CHECK-LABEL: test_vcvts_s32_f32 -// CHECK: fcvtzs {{[sw][0-9]+}}, {{s[0-9]+}} return (int32_t)vcvts_s32_f32(a); } +// CHECK-LABEL: define i64 @test_vcvtd_s64_f64(double %a) #0 { +// CHECK: [[TMP0:%.*]] = fptosi double %a to i64 +// CHECK: ret i64 [[TMP0]] int64_t test_vcvtd_s64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtd_s64_f64 -// CHECK: fcvtzs {{[dx][0-9]+}}, {{d[0-9]+}} return (int64_t)vcvtd_s64_f64(a); } +// CHECK-LABEL: define i32 @test_vcvts_u32_f32(float %a) #0 { +// CHECK: [[TMP0:%.*]] = fptoui float %a to i32 +// CHECK: ret i32 [[TMP0]] uint32_t test_vcvts_u32_f32(float32_t a) { -// CHECK-LABEL: test_vcvts_u32_f32 -// CHECK: fcvtzu {{[sw][0-9]+}}, {{s[0-9]+}} return 
(uint32_t)vcvts_u32_f32(a);
 }

+// CHECK-LABEL: define i64 @test_vcvtd_u64_f64(double %a) #0 {
+// CHECK: [[TMP0:%.*]] = fptoui double %a to i64
+// CHECK: ret i64 [[TMP0]]
 uint64_t test_vcvtd_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtd_u64_f64
-// CHECK: fcvtzu {{[dx][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtd_u64_f64(a);
 }
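// Of the scalar conversions above, only the truncating vcvts/vcvtd forms
// lower to plain fptosi/fptoui casts; the rounding variants (fcvtas, fcvtms,
// fcvtns, fcvtps) keep target intrinsics, since a C cast only expresses
// round-toward-zero. Equivalent plain-C view of the fptosi test, as a sketch
// (cvt_s32_reference is a hypothetical helper, not part of the tests):

#include <arm_neon.h>

int32_t cvt_s32_reference(float32_t a) {
  return (int32_t)a; // fptosi float %a to i32: truncation toward zero
}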
float> %a, <4 x float> %b, float %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3 +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]] +// CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) { - // CHECK-LABEL: test_vmlsq_n_f32 return vmlsq_n_f32(a, b, c); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] - // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] - // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1 +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]] +// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]] +// CHECK: ret <2 x float> [[SUB_I]] float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) { - // CHECK-LABEL: test_vmls_n_f32 return vmls_n_f32(a, b, c); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] - // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] - // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x double> @test_vmlsq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1 +// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %b, [[VECINIT1_I]] +// CHECK: [[SUB_I:%.*]] = fsub <2 x double> %a, [[MUL_I]] +// CHECK: ret <2 x double> [[SUB_I]] float64x2_t test_vmlsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) { - // CHECK-LABEL: test_vmlsq_n_f64 return vmlsq_n_f64(a, b, c); - // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] - // CHECK: fsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d - // CHECK-FMA: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] - // CHECK-FMA: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { - // CHECK-LABEL: test_vmla_lane_f32_0 return vmla_lane_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] - // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> 
%b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { - // CHECK-LABEL: test_vmlaq_lane_f32_0 return vmlaq_lane_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] - // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { - // CHECK-LABEL: test_vmla_laneq_f32_0 return vmla_laneq_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] - // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { - // CHECK-LABEL: test_vmlaq_laneq_f32_0 return vmlaq_laneq_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] - // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { - // CHECK-LABEL: test_vmls_lane_f32_0 return vmls_lane_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] - // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { - // CHECK-LABEL: test_vmlsq_lane_f32_0 return vmlsq_lane_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] - // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer +// 
CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { - // CHECK-LABEL: test_vmls_laneq_f32_0 return vmls_laneq_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] - // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { - // CHECK-LABEL: test_vmlsq_laneq_f32_0 return vmlsq_laneq_f32(a, b, v, 0); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] - // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] } +// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { - // CHECK-LABEL: test_vmla_lane_f32 return vmla_lane_f32(a, b, v, 1); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] - // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { - // CHECK-LABEL: test_vmlaq_lane_f32 return vmlaq_lane_f32(a, b, v, 1); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] - // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { - // CHECK-LABEL: test_vmla_laneq_f32 return vmla_laneq_f32(a, b, v, 3); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] - // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, 
[[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { - // CHECK-LABEL: test_vmlaq_laneq_f32 return vmlaq_laneq_f32(a, b, v, 3); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] - // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { - // CHECK-LABEL: test_vmls_lane_f32 return vmls_lane_f32(a, b, v, 1); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] - // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { - // CHECK-LABEL: test_vmlsq_lane_f32 return vmlsq_lane_f32(a, b, v, 1); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] - // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { - // CHECK-LABEL: test_vmls_laneq_f32 return vmls_laneq_f32(a, b, v, 3); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] - // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { - // CHECK-LABEL: test_vmlsq_laneq_f32 return vmlsq_laneq_f32(a, b, v, 3); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] - // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define <2 x double> @test_vfmaq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast 
<2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #2 +// CHECK: ret <2 x double> [[TMP6]] float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) { - // CHECK-LABEL: test_vfmaq_n_f64: return vfmaq_n_f64(a, b, c); - // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+\.2d|v[0-9]+\.d\[0\]}} } +// CHECK-LABEL: define <2 x double> @test_vfmsq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[TMP4:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[TMP3]] +// CHECK: [[FMLS_I_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[FMLS1_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[FMLS2_I_I:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLS_I_I]], <2 x double> [[TMP4]], <2 x double> [[FMLS1_I_I]]) #2 +// CHECK: ret <2 x double> [[FMLS2_I_I]] float64x2_t test_vfmsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) { - // CHECK-LABEL: test_vfmsq_n_f64: return vfmsq_n_f64(a, b, c); - // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+\.2d|v[0-9]+\.d\[0\]}} } Index: test/CodeGen/aarch64-neon-intrinsics.c =================================================================== --- test/CodeGen/aarch64-neon-intrinsics.c +++ test/CodeGen/aarch64-neon-intrinsics.c @@ -1,334 +1,394 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 +// RUN: -fallow-half-arguments-and-returns -ffp-contract=fast -S -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg \ +// RUN: | FileCheck %s // Test new aarch64 intrinsics and types #include <arm_neon.h> +// CHECK-LABEL: define <8 x i8> @test_vadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, %v2 +// CHECK: ret <8 x i8> [[ADD_I]] int8x8_t test_vadd_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vadd_s8 return vadd_s8(v1, v2); - // CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, %v2 +// CHECK: ret <4 x i16> [[ADD_I]] int16x4_t test_vadd_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vadd_s16 return vadd_s16(v1, v2); - // CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, %v2 +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vadd_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vadd_s32 return vadd_s32(v1, v2); - // CHECK: add {{v[0-9]+}}.2s, 
{{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vadd_s64(<1 x i64> %v1, <1 x i64> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <1 x i64> %v1, %v2 +// CHECK: ret <1 x i64> [[ADD_I]] int64x1_t test_vadd_s64(int64x1_t v1, int64x1_t v2) { - // CHECK-LABEL: test_vadd_s64 return vadd_s64(v1, v2); - // CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x float> @test_vadd_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %v1, %v2 +// CHECK: ret <2 x float> [[ADD_I]] float32x2_t test_vadd_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vadd_f32 return vadd_f32(v1, v2); - // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vadd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, %v2 +// CHECK: ret <8 x i8> [[ADD_I]] uint8x8_t test_vadd_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vadd_u8 return vadd_u8(v1, v2); - // CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vadd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, %v2 +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vadd_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vadd_u16 return vadd_u16(v1, v2); - // CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vadd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, %v2 +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vadd_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vadd_u32 return vadd_u32(v1, v2); - // CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vadd_u64(<1 x i64> %v1, <1 x i64> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <1 x i64> %v1, %v2 +// CHECK: ret <1 x i64> [[ADD_I]] uint64x1_t test_vadd_u64(uint64x1_t v1, uint64x1_t v2) { - // CHECK-LABEL: test_vadd_u64 return vadd_u64(v1, v2); - // CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <16 x i8> @test_vaddq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, %v2 +// CHECK: ret <16 x i8> [[ADD_I]] int8x16_t test_vaddq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vaddq_s8 return vaddq_s8(v1, v2); - // CHECK: add {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vaddq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, %v2 +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vaddq_s16 return vaddq_s16(v1, v2); - // CHECK: add {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vaddq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, %v2 +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddq_s32(int32x4_t v1,int32x4_t v2) { - // CHECK-LABEL: test_vaddq_s32 return vaddq_s32(v1, v2); - // CHECK: add {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vaddq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %v1, %v2 +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddq_s64(int64x2_t v1, int64x2_t v2) { - // CHECK-LABEL: test_vaddq_s64 return vaddq_s64(v1, v2); - // CHECK: add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <4 x float> @test_vaddq_f32(<4 x 
float> %v1, <4 x float> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %v1, %v2 +// CHECK: ret <4 x float> [[ADD_I]] float32x4_t test_vaddq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vaddq_f32 return vaddq_f32(v1, v2); - // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vaddq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = fadd <2 x double> %v1, %v2 +// CHECK: ret <2 x double> [[ADD_I]] float64x2_t test_vaddq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vaddq_f64 return vaddq_f64(v1, v2); - // CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vaddq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, %v2 +// CHECK: ret <16 x i8> [[ADD_I]] uint8x16_t test_vaddq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vaddq_u8 return vaddq_u8(v1, v2); - // CHECK: add {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vaddq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, %v2 +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vaddq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vaddq_u16 return vaddq_u16(v1, v2); - // CHECK: add {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vaddq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, %v2 +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK: vaddq_u32 return vaddq_u32(v1, v2); - // CHECK: add {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vaddq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %v1, %v2 +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddq_u64(uint64x2_t v1, uint64x2_t v2) { - // CHECK-LABEL: test_vaddq_u64 return vaddq_u64(v1, v2); - // CHECK: add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vsub_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2 +// CHECK: ret <8 x i8> [[SUB_I]] int8x8_t test_vsub_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vsub_s8 return vsub_s8(v1, v2); - // CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vsub_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2 +// CHECK: ret <4 x i16> [[SUB_I]] int16x4_t test_vsub_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vsub_s16 return vsub_s16(v1, v2); - // CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vsub_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2 +// CHECK: ret <2 x i32> [[SUB_I]] int32x2_t test_vsub_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vsub_s32 return vsub_s32(v1, v2); - // CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vsub_s64(<1 x i64> %v1, <1 x i64> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2 +// CHECK: ret <1 x i64> [[SUB_I]] int64x1_t test_vsub_s64(int64x1_t v1, int64x1_t v2) { - // CHECK-LABEL: test_vsub_s64 return vsub_s64(v1, v2); - // CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x float> @test_vsub_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = 
fsub <2 x float> %v1, %v2 +// CHECK: ret <2 x float> [[SUB_I]] float32x2_t test_vsub_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vsub_f32 return vsub_f32(v1, v2); - // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vsub_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2 +// CHECK: ret <8 x i8> [[SUB_I]] uint8x8_t test_vsub_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vsub_u8 return vsub_u8(v1, v2); - // CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vsub_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2 +// CHECK: ret <4 x i16> [[SUB_I]] uint16x4_t test_vsub_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vsub_u16 return vsub_u16(v1, v2); - // CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vsub_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2 +// CHECK: ret <2 x i32> [[SUB_I]] uint32x2_t test_vsub_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vsub_u32 return vsub_u32(v1, v2); - // CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vsub_u64(<1 x i64> %v1, <1 x i64> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2 +// CHECK: ret <1 x i64> [[SUB_I]] uint64x1_t test_vsub_u64(uint64x1_t v1, uint64x1_t v2) { - // CHECK-LABEL: test_vsub_u64 return vsub_u64(v1, v2); - // CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <16 x i8> @test_vsubq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2 +// CHECK: ret <16 x i8> [[SUB_I]] int8x16_t test_vsubq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vsubq_s8 return vsubq_s8(v1, v2); - // CHECK: sub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vsubq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2 +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vsubq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vsubq_s16 return vsubq_s16(v1, v2); - // CHECK: sub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vsubq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2 +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubq_s32(int32x4_t v1,int32x4_t v2) { - // CHECK-LABEL: test_vsubq_s32 return vsubq_s32(v1, v2); - // CHECK: sub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vsubq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2 +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vsubq_s64(int64x2_t v1, int64x2_t v2) { - // CHECK-LABEL: test_vsubq_s64 return vsubq_s64(v1, v2); - // CHECK: sub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <4 x float> @test_vsubq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %v1, %v2 +// CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vsubq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vsubq_f32 return vsubq_f32(v1, v2); - // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vsubq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = fsub <2 x double> %v1, %v2 +// CHECK: ret <2 x double> 
[[SUB_I]] float64x2_t test_vsubq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vsubq_f64 return vsubq_f64(v1, v2); - // CHECK: fsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vsubq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2 +// CHECK: ret <16 x i8> [[SUB_I]] uint8x16_t test_vsubq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vsubq_u8 return vsubq_u8(v1, v2); - // CHECK: sub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vsubq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2 +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vsubq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vsubq_u16 return vsubq_u16(v1, v2); - // CHECK: sub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vsubq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2 +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vsubq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK: vsubq_u32 return vsubq_u32(v1, v2); - // CHECK: sub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vsubq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2 +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vsubq_u64(uint64x2_t v1, uint64x2_t v2) { - // CHECK-LABEL: test_vsubq_u64 return vsubq_u64(v1, v2); - // CHECK: sub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vmul_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2 +// CHECK: ret <8 x i8> [[MUL_I]] int8x8_t test_vmul_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vmul_s8 return vmul_s8(v1, v2); - // CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmul_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2 +// CHECK: ret <4 x i16> [[MUL_I]] int16x4_t test_vmul_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vmul_s16 return vmul_s16(v1, v2); - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmul_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2 +// CHECK: ret <2 x i32> [[MUL_I]] int32x2_t test_vmul_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vmul_s32 return vmul_s32(v1, v2); - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vmul_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v1, %v2 +// CHECK: ret <2 x float> [[MUL_I]] float32x2_t test_vmul_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vmul_f32 return vmul_f32(v1, v2); - // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vmul_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2 +// CHECK: ret <8 x i8> [[MUL_I]] uint8x8_t test_vmul_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vmul_u8 return vmul_u8(v1, v2); - // CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmul_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2 +// CHECK: ret <4 x i16> [[MUL_I]] uint16x4_t test_vmul_u16(uint16x4_t v1, uint16x4_t v2) { - // 
CHECK-LABEL: test_vmul_u16 return vmul_u16(v1, v2); - // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmul_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2 +// CHECK: ret <2 x i32> [[MUL_I]] uint32x2_t test_vmul_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vmul_u32 return vmul_u32(v1, v2); - // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2 +// CHECK: ret <16 x i8> [[MUL_I]] int8x16_t test_vmulq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vmulq_s8 return vmulq_s8(v1, v2); - // CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2 +// CHECK: ret <8 x i16> [[MUL_I]] int16x8_t test_vmulq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vmulq_s16 return vmulq_s16(v1, v2); - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2 +// CHECK: ret <4 x i32> [[MUL_I]] int32x4_t test_vmulq_s32(int32x4_t v1, int32x4_t v2) { - // CHECK-LABEL: test_vmulq_s32 return vmulq_s32(v1, v2); - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2 +// CHECK: ret <16 x i8> [[MUL_I]] uint8x16_t test_vmulq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vmulq_u8 return vmulq_u8(v1, v2); - // CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2 +// CHECK: ret <8 x i16> [[MUL_I]] uint16x8_t test_vmulq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vmulq_u16 return vmulq_u16(v1, v2); - // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2 +// CHECK: ret <4 x i32> [[MUL_I]] uint32x4_t test_vmulq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK-LABEL: test_vmulq_u32 return vmulq_u32(v1, v2); - // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vmulq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v1, %v2 +// CHECK: ret <4 x float> [[MUL_I]] float32x4_t test_vmulq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vmulq_f32 return vmulq_f32(v1, v2); - // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vmulq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v1, %v2 +// CHECK: ret <2 x double> [[MUL_I]] float64x2_t test_vmulq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vmulq_f64 return vmulq_f64(v1, v2); - // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vmul_p8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4 +// CHECK: ret <8 x i8> [[VMUL_V_I]] poly8x8_t 
test_vmul_p8(poly8x8_t v1, poly8x8_t v2) { // test_vmul_p8 return vmul_p8(v1, v2); // pmul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4 +// CHECK: ret <16 x i8> [[VMULQ_V_I]] poly8x16_t test_vmulq_p8(poly8x16_t v1, poly8x16_t v2) { // test_vmulq_p8 return vmulq_p8(v1, v2); @@ -336,1295 +396,2132 @@ } +// CHECK-LABEL: define <8 x i8> @test_vmla_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]] +// CHECK: ret <8 x i8> [[ADD_I]] int8x8_t test_vmla_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) { - // CHECK-LABEL: test_vmla_s8 return vmla_s8(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vmla_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[ADD_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vmla_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) { - // CHECK-LABEL: test_vmla_s16 return vmla_s16(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmla_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vmla_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) { - // CHECK-LABEL: test_vmla_s32 return vmla_s32(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vmla_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %v1, [[MUL_I]] +// CHECK: ret <2 x float> [[ADD_I]] float32x2_t test_vmla_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { - // CHECK-LABEL: test_vmla_f32 return vmla_f32(v1, v2, v3); - // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vmla_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]] +// CHECK: ret <8 x i8> [[ADD_I]] uint8x8_t test_vmla_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) { - // CHECK-LABEL: test_vmla_u8 return vmla_u8(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmla_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]] +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vmla_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) { - // CHECK-LABEL: test_vmla_u16 return vmla_u16(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmla_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vmla_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) { - // CHECK-LABEL: test_vmla_u32 return vmla_u32(v1, v2, v3); - // CHECK: 
mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vmlaq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]] +// CHECK: ret <16 x i8> [[ADD_I]] int8x16_t test_vmlaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) { - // CHECK-LABEL: test_vmlaq_s8 return vmlaq_s8(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vmlaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) { - // CHECK-LABEL: test_vmlaq_s16 return vmlaq_s16(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) { - // CHECK-LABEL: test_vmlaq_s32 return vmlaq_s32(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vmlaq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %v1, [[MUL_I]] +// CHECK: ret <4 x float> [[ADD_I]] float32x4_t test_vmlaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { - // CHECK-LABEL: test_vmlaq_f32 return vmlaq_f32(v1, v2, v3); - // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vmlaq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]] +// CHECK: ret <16 x i8> [[ADD_I]] uint8x16_t test_vmlaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) { - // CHECK-LABEL: test_vmlaq_u8 return vmlaq_u8(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmlaq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vmlaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) { - // CHECK-LABEL: test_vmlaq_u16 return vmlaq_u16(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmlaq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) { - // CHECK-LABEL: test_vmlaq_u32 return vmlaq_u32(v1, v2, v3); - // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vmlaq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3 +// CHECK: [[ADD_I:%.*]] = fadd <2 x double> %v1, [[MUL_I]] +// CHECK: ret <2 x double> [[ADD_I]] float64x2_t test_vmlaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { - // CHECK-LABEL: test_vmlaq_f64 return vmlaq_f64(v1, v2, 
v3); - // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vmls_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]] +// CHECK: ret <8 x i8> [[SUB_I]] int8x8_t test_vmls_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) { - // CHECK-LABEL: test_vmls_s8 return vmls_s8(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vmls_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SUB_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vmls_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) { - // CHECK-LABEL: test_vmls_s16 return vmls_s16(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmls_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] int32x2_t test_vmls_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) { - // CHECK-LABEL: test_vmls_s32 return vmls_s32(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vmls_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %v1, [[MUL_I]] +// CHECK: ret <2 x float> [[SUB_I]] float32x2_t test_vmls_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { - // CHECK-LABEL: test_vmls_f32 return vmls_f32(v1, v2, v3); - // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vmls_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]] +// CHECK: ret <8 x i8> [[SUB_I]] uint8x8_t test_vmls_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) { - // CHECK-LABEL: test_vmls_u8 return vmls_u8(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmls_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]] +// CHECK: ret <4 x i16> [[SUB_I]] uint16x4_t test_vmls_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) { - // CHECK-LABEL: test_vmls_u16 return vmls_u16(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmls_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] uint32x2_t test_vmls_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) { - // CHECK-LABEL: test_vmls_u32 return vmls_u32(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]] +// CHECK: ret <16 x i8> [[SUB_I]] int8x16_t test_vmlsq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) { - // CHECK-LABEL: test_vmlsq_s8 return vmlsq_s8(v1, v2, v3); - // 
CHECK: mls {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vmlsq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) { - // CHECK-LABEL: test_vmlsq_s16 return vmlsq_s16(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) { - // CHECK-LABEL: test_vmlsq_s32 return vmlsq_s32(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vmlsq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %v1, [[MUL_I]] +// CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vmlsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { - // CHECK-LABEL: test_vmlsq_f32 return vmlsq_f32(v1, v2, v3); - // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]] +// CHECK: ret <16 x i8> [[SUB_I]] uint8x16_t test_vmlsq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) { - // CHECK-LABEL: test_vmlsq_u8 return vmlsq_u8(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vmlsq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) { - // CHECK-LABEL: test_vmlsq_u16 return vmlsq_u16(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) { - // CHECK-LABEL: test_vmlsq_u32 return vmlsq_u32(v1, v2, v3); - // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vmlsq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3 +// CHECK: [[SUB_I:%.*]] = fsub <2 x double> %v1, [[MUL_I]] +// CHECK: ret <2 x double> [[SUB_I]] float64x2_t test_vmlsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { - // CHECK-LABEL: test_vmlsq_f64 return vmlsq_f64(v1, v2, v3); - // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vfma_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x 
float> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4 +// CHECK: ret <2 x float> [[TMP6]] float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { - // CHECK-LABEL: test_vfma_f32 return vfma_f32(v1, v2, v3); - // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4 +// CHECK: ret <4 x float> [[TMP6]] float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { - // CHECK-LABEL: test_vfmaq_f32 return vfmaq_f32(v1, v2, v3); - // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vfmaq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #4 +// CHECK: ret <2 x double> [[TMP6]] float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { - // CHECK-LABEL: test_vfmaq_f64 return vfmaq_f64(v1, v2, v3); - // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vfms_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP4:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, [[TMP3]] +// CHECK: [[FMLS_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[FMLS1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[FMLS2_I:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLS_I]], <2 x float> [[TMP4]], <2 x float> [[FMLS1_I]]) #4 +// CHECK: ret <2 x float> [[FMLS2_I]] float32x2_t test_vfms_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { - // CHECK-LABEL: test_vfms_f32 return vfms_f32(v1, v2, v3); - // CHECK: fmls v0.2s, {{v1.2s, v2.2s|v2.2s, v1.2s}} } +// CHECK-LABEL: define <4 x float> @test_vfmsq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// 
CHECK: [[TMP4:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[TMP3]] +// CHECK: [[FMLS_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[FMLS1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[FMLS2_I:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLS_I]], <4 x float> [[TMP4]], <4 x float> [[FMLS1_I]]) #4 +// CHECK: ret <4 x float> [[FMLS2_I]] float32x4_t test_vfmsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { - // CHECK-LABEL: test_vfmsq_f32 return vfmsq_f32(v1, v2, v3); - // CHECK: fmls v0.4s, {{v1.4s, v2.4s|v2.4s, v1.4s}} } +// CHECK-LABEL: define <2 x double> @test_vfmsq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[TMP4:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[TMP3]] +// CHECK: [[FMLS_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[FMLS1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[FMLS2_I:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLS_I]], <2 x double> [[TMP4]], <2 x double> [[FMLS1_I]]) #4 +// CHECK: ret <2 x double> [[FMLS2_I]] float64x2_t test_vfmsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { - // CHECK: vfmsq_f64 return vfmsq_f64(v1, v2, v3); - // CHECK: fmls v0.2d, {{v1.2d, v2.2d|v2.2d, v1.2d}} } +// CHECK-LABEL: define <2 x double> @test_vdivq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[DIV_I:%.*]] = fdiv <2 x double> %v1, %v2 +// CHECK: ret <2 x double> [[DIV_I]] float64x2_t test_vdivq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vdivq_f64 return vdivq_f64(v1, v2); - // CHECK: fdiv {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <4 x float> @test_vdivq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[DIV_I:%.*]] = fdiv <4 x float> %v1, %v2 +// CHECK: ret <4 x float> [[DIV_I]] float32x4_t test_vdivq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vdivq_f32 return vdivq_f32(v1, v2); - // CHECK: fdiv {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x float> @test_vdiv_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[DIV_I:%.*]] = fdiv <2 x float> %v1, %v2 +// CHECK: ret <2 x float> [[DIV_I]] float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vdiv_f32 return vdiv_f32(v1, v2); - // CHECK: fdiv {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vaba_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v2, <8 x i8> %v3) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]] +// CHECK: ret <8 x i8> [[ADD_I]] int8x8_t test_vaba_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) { - // CHECK-LABEL: test_vaba_s8 return vaba_s8(v1, v2, v3); - // CHECK: saba {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vaba_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> 
[[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]] +// CHECK: ret <4 x i16> [[ADD_I]] int16x4_t test_vaba_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) { - // CHECK-LABEL: test_vaba_s16 return vaba_s16(v1, v2, v3); - // CHECK: saba {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vaba_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]] +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vaba_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) { - // CHECK-LABEL: test_vaba_s32 return vaba_s32(v1, v2, v3); - // CHECK: saba {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vaba_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v2, <8 x i8> %v3) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]] +// CHECK: ret <8 x i8> [[ADD_I]] uint8x8_t test_vaba_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) { - // CHECK-LABEL: test_vaba_u8 return vaba_u8(v1, v2, v3); - // CHECK: uaba {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vaba_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]] +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vaba_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) { - // CHECK-LABEL: test_vaba_u16 return vaba_u16(v1, v2, v3); - // CHECK: uaba {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vaba_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]] +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vaba_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) { - // CHECK-LABEL: test_vaba_u32 return vaba_u32(v1, v2, v3); - // CHECK: uaba {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vabaq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v2, <16 x i8> %v3) #4 +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]] +// CHECK: ret <16 x i8> [[ADD_I]] int8x16_t test_vabaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) { - // CHECK-LABEL: test_vabaq_s8 return vabaq_s8(v1, v2, 
v3); - // CHECK: saba {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vabaq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vabaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) { - // CHECK-LABEL: test_vabaq_s16 return vabaq_s16(v1, v2, v3); - // CHECK: saba {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vabaq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vabaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) { - // CHECK-LABEL: test_vabaq_s32 return vabaq_s32(v1, v2, v3); - // CHECK: saba {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vabaq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v2, <16 x i8> %v3) #4 +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]] +// CHECK: ret <16 x i8> [[ADD_I]] uint8x16_t test_vabaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) { - // CHECK-LABEL: test_vabaq_u8 return vabaq_u8(v1, v2, v3); - // CHECK: uaba {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vabaq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vabaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) { - // CHECK-LABEL: test_vabaq_u16 return vabaq_u16(v1, v2, v3); - // CHECK: uaba {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vabaq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t 
test_vabaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) { - // CHECK-LABEL: test_vabaq_u32 return vabaq_u32(v1, v2, v3); - // CHECK: uaba {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i8> @test_vabd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4 +// CHECK: ret <8 x i8> [[VABD_I]] int8x8_t test_vabd_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vabd_s8 return vabd_s8(v1, v2); - // CHECK: sabd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vabd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4 +// CHECK: ret <4 x i16> [[VABD2_I]] int16x4_t test_vabd_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vabd_s16 return vabd_s16(v1, v2); - // CHECK: sabd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vabd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4 +// CHECK: ret <2 x i32> [[VABD2_I]] int32x2_t test_vabd_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vabd_s32 return vabd_s32(v1, v2); - // CHECK: sabd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vabd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4 +// CHECK: ret <8 x i8> [[VABD_I]] uint8x8_t test_vabd_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vabd_u8 return vabd_u8(v1, v2); - // CHECK: uabd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vabd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4 +// CHECK: ret <4 x i16> [[VABD2_I]] uint16x4_t test_vabd_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vabd_u16 return vabd_u16(v1, v2); - // CHECK: uabd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vabd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4 +// CHECK: ret <2 x i32> [[VABD2_I]] uint32x2_t test_vabd_u32(uint32x2_t v1, uint32x2_t v2) { - 
// CHECK-LABEL: test_vabd_u32 return vabd_u32(v1, v2); - // CHECK: uabd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vabd_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> [[VABD_I]], <2 x float> [[VABD1_I]]) #4 +// CHECK: ret <2 x float> [[VABD2_I]] float32x2_t test_vabd_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vabd_f32 return vabd_f32(v1, v2); - // CHECK: fabd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vabdq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4 +// CHECK: ret <16 x i8> [[VABD_I]] int8x16_t test_vabdq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vabdq_s8 return vabdq_s8(v1, v2); - // CHECK: sabd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vabdq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4 +// CHECK: ret <8 x i16> [[VABD2_I]] int16x8_t test_vabdq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vabdq_s16 return vabdq_s16(v1, v2); - // CHECK: sabd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vabdq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4 +// CHECK: ret <4 x i32> [[VABD2_I]] int32x4_t test_vabdq_s32(int32x4_t v1, int32x4_t v2) { - // CHECK-LABEL: test_vabdq_s32 return vabdq_s32(v1, v2); - // CHECK: sabd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vabdq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4 +// CHECK: ret <16 x i8> [[VABD_I]] uint8x16_t test_vabdq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vabdq_u8 return vabdq_u8(v1, v2); - // CHECK: uabd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vabdq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4 +// CHECK: ret <8 x i16> [[VABD2_I]] uint16x8_t test_vabdq_u16(uint16x8_t v1, 
uint16x8_t v2) { - // CHECK-LABEL: test_vabdq_u16 return vabdq_u16(v1, v2); - // CHECK: uabd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vabdq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4 +// CHECK: ret <4 x i32> [[VABD2_I]] uint32x4_t test_vabdq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK-LABEL: test_vabdq_u32 return vabdq_u32(v1, v2); - // CHECK: uabd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vabdq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> [[VABD_I]], <4 x float> [[VABD1_I]]) #4 +// CHECK: ret <4 x float> [[VABD2_I]] float32x4_t test_vabdq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vabdq_f32 return vabdq_f32(v1, v2); - // CHECK: fabd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vabdq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VABD2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> [[VABD_I]], <2 x double> [[VABD1_I]]) #4 +// CHECK: ret <2 x double> [[VABD2_I]] float64x2_t test_vabdq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vabdq_f64 return vabdq_f64(v1, v2); - // CHECK: fabd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vbsl_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2 +// CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3 +// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <8 x i8> [[VBSL2_I]] int8x8_t test_vbsl_s8(uint8x8_t v1, int8x8_t v2, int8x8_t v3) { - // CHECK-LABEL: test_vbsl_s8 return vbsl_s8(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vbsl_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: [[TMP4:%.*]] =
bitcast <4 x i16> [[VBSL5_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[TMP4]] int8x8_t test_vbsl_s16(uint16x4_t v1, int16x4_t v2, int16x4_t v3) { - // CHECK-LABEL: test_vbsl_s16 return vbsl_s16(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i32> @test_vbsl_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i32> [[VBSL5_I]] int32x2_t test_vbsl_s32(uint32x2_t v1, int32x2_t v2, int32x2_t v3) { - // CHECK-LABEL: test_vbsl_s32 return vbsl_s32(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <1 x i64> @test_vbsl_s64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <1 x i64> [[VBSL5_I]] uint64x1_t test_vbsl_s64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) { - // CHECK-LABEL: test_vbsl_s64 return vbsl_s64(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vbsl_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2 +// CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3 +// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <8 x i8> [[VBSL2_I]] uint8x8_t test_vbsl_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) { - // CHECK-LABEL: test_vbsl_u8 return vbsl_u8(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vbsl_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i16> [[VBSL5_I]] uint16x4_t test_vbsl_u16(uint16x4_t v1,
uint16x4_t v2, uint16x4_t v3) { - // CHECK-LABEL: test_vbsl_u16 return vbsl_u16(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i32> @test_vbsl_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i32> [[VBSL5_I]] uint32x2_t test_vbsl_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) { - // CHECK-LABEL: test_vbsl_u32 return vbsl_u32(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <1 x i64> @test_vbsl_u64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <1 x i64> [[VBSL5_I]] uint64x1_t test_vbsl_u64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) { - // CHECK-LABEL: test_vbsl_u64 return vbsl_u64(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x float> @test_vbsl_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <2 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x float> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP4:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP4]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP5]] float32x2_t test_vbsl_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { - // CHECK-LABEL: test_vbsl_f32 return vbsl_f32(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <1 x double> @test_vbsl_f64(<1 x i64> %v1, <1 x double> %v2, <1 x double> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8>
[[TMP0]] to <1 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[VBSL5_I]] to <1 x double> +// CHECK: ret <1 x double> [[TMP4]] float64x1_t test_vbsl_f64(uint64x1_t v1, float64x1_t v2, float64x1_t v3) { - // CHECK-LABEL: test_vbsl_f64 return vbsl_f64(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vbsl_p8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 { +// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2 +// CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3 +// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <8 x i8> [[VBSL2_I]] poly8x8_t test_vbsl_p8(uint8x8_t v1, poly8x8_t v2, poly8x8_t v3) { - // CHECK-LABEL: test_vbsl_p8 return vbsl_p8(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vbsl_p16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i16> [[VBSL5_I]] poly16x4_t test_vbsl_p16(uint16x4_t v1, poly16x4_t v2, poly16x4_t v3) { - // CHECK-LABEL: test_vbsl_p16 return vbsl_p16(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vbslq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2 +// CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3 +// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <16 x i8> [[VBSL2_I]] int8x16_t test_vbslq_s8(uint8x16_t v1, int8x16_t v2, int8x16_t v3) { - // CHECK-LABEL: test_vbslq_s8 return vbslq_s8(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vbslq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <8 x i16> [[VBSL5_I]]
int16x8_t test_vbslq_s16(uint16x8_t v1, int16x8_t v2, int16x8_t v3) { - // CHECK-LABEL: test_vbslq_s16 return vbslq_s16(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vbslq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i32> [[VBSL5_I]] int32x4_t test_vbslq_s32(uint32x4_t v1, int32x4_t v2, int32x4_t v3) { - // CHECK-LABEL: test_vbslq_s32 return vbslq_s32(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i64> @test_vbslq_s64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i64> [[VBSL5_I]] int64x2_t test_vbslq_s64(uint64x2_t v1, int64x2_t v2, int64x2_t v3) { - // CHECK-LABEL: test_vbslq_s64 return vbslq_s64(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vbslq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2 +// CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3 +// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <16 x i8> [[VBSL2_I]] uint8x16_t test_vbslq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) { - // CHECK-LABEL: test_vbslq_u8 return vbslq_u8(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vbslq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <8 x i16> [[VBSL5_I]] uint16x8_t test_vbslq_u16(uint16x8_t v1, uint16x8_t v2,
uint16x8_t v3) { - // CHECK-LABEL: test_vbslq_u16 return vbslq_u16(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vbslq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i32> [[VBSL5_I]] uint32x4_t test_vbslq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) { - // CHECK-LABEL: test_vbslq_u32 return vbslq_u32(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i64> @test_vbslq_u64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i64> [[VBSL5_I]] uint64x2_t test_vbslq_u64(uint64x2_t v1, uint64x2_t v2, uint64x2_t v3) { - // CHECK-LABEL: test_vbslq_u64 return vbslq_u64(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x float> @test_vbslq_f32(<4 x i32> %v1, <4 x float> %v2, <4 x float> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP4]] float32x4_t test_vbslq_f32(uint32x4_t v1, float32x4_t v2, float32x4_t v3) { - // CHECK-LABEL: test_vbslq_f32 return vbslq_f32(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vbslq_p8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 { +// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2 +// CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3 +// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <16 x i8> [[VBSL2_I]] poly8x16_t
test_vbslq_p8(uint8x16_t v1, poly8x16_t v2, poly8x16_t v3) { - // CHECK-LABEL: test_vbslq_p8 return vbslq_p8(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vbslq_p16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <8 x i16> [[VBSL5_I]] poly16x8_t test_vbslq_p16(uint16x8_t v1, poly16x8_t v2, poly16x8_t v3) { - // CHECK-LABEL: test_vbslq_p16 return vbslq_p16(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x double> @test_vbslq_f64(<2 x i64> %v1, <2 x double> %v2, <2 x double> %v3) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[VBSL5_I]] to <2 x double> +// CHECK: ret <2 x double> [[TMP4]] float64x2_t test_vbslq_f64(uint64x2_t v1, float64x2_t v2, float64x2_t v3) { - // CHECK-LABEL: test_vbslq_f64 return vbslq_f64(v1, v2, v3); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x float> @test_vrecps_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> [[VRECPS_V_I]], <2 x float> [[VRECPS_V1_I]]) #4 +// CHECK: [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vrecps_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vrecps_f32 return vrecps_f32(v1, v2); - // CHECK: frecps {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float>
@llvm.aarch64.neon.frecps.v4f32(<4 x float> [[VRECPSQ_V_I]], <4 x float> [[VRECPSQ_V1_I]]) #4 +// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vrecpsq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vrecpsq_f32 return vrecpsq_f32(v1, v2); - // CHECK: frecps {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrecpsq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VRECPSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> [[VRECPSQ_V_I]], <2 x double> [[VRECPSQ_V1_I]]) #4 +// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <2 x double> [[VRECPSQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <2 x double> +// CHECK: ret <2 x double> [[TMP2]] float64x2_t test_vrecpsq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vrecpsq_f64 return vrecpsq_f64(v1, v2); - // CHECK: frecps {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrsqrts_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> [[VRSQRTS_V_I]], <2 x float> [[VRSQRTS_V1_I]]) #4 +// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vrsqrts_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vrsqrts_f32 return vrsqrts_f32(v1, v2); - // CHECK: frsqrts {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> [[VRSQRTSQ_V_I]], <4 x float> [[VRSQRTSQ_V1_I]]) #4 +// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vrsqrtsq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vrsqrtsq_f32 return vrsqrtsq_f32(v1, v2); - // CHECK: frsqrts {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrsqrtsq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: 
[[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> [[VRSQRTSQ_V_I]], <2 x double> [[VRSQRTSQ_V1_I]]) #4 +// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <2 x double> [[VRSQRTSQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <2 x double> +// CHECK: ret <2 x double> [[TMP2]] float64x2_t test_vrsqrtsq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vrsqrtsq_f64 return vrsqrtsq_f64(v1, v2); - // CHECK: frsqrts {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcage_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCAGE_V_I]], <2 x float> [[VCAGE_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCAGE_V2_I]] uint32x2_t test_vcage_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vcage_f32 return vcage_f32(v1, v2); - // CHECK: facge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vcage_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VCAGE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[VCAGE_V_I]], <1 x double> [[VCAGE_V1_I]]) #4 +// CHECK: ret <1 x i64> [[VCAGE_V2_I]] uint64x1_t test_vcage_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vcage_f64 return vcage_f64(a, b); - // CHECK: facge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <4 x i32> @test_vcageq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCAGEQ_V_I]], <4 x float> [[VCAGEQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCAGEQ_V2_I]] uint32x4_t test_vcageq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vcageq_f32 return vcageq_f32(v1, v2); - // CHECK: facge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcageq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VCAGEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[VCAGEQ_V_I]], <2 x double> [[VCAGEQ_V1_I]]) #4 +// CHECK: ret <2 x i64> [[VCAGEQ_V2_I]] uint64x2_t test_vcageq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vcageq_f64 return vcageq_f64(v1, v2); - // CHECK: facge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// 
CHECK-LABEL: define <2 x i32> @test_vcagt_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCAGT_V_I]], <2 x float> [[VCAGT_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCAGT_V2_I]] uint32x2_t test_vcagt_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vcagt_f32 return vcagt_f32(v1, v2); - // CHECK: facgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vcagt_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VCAGT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[VCAGT_V_I]], <1 x double> [[VCAGT_V1_I]]) #4 +// CHECK: ret <1 x i64> [[VCAGT_V2_I]] uint64x1_t test_vcagt_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vcagt_f64 return vcagt_f64(a, b); - // CHECK: facgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCAGTQ_V_I]], <4 x float> [[VCAGTQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCAGTQ_V2_I]] uint32x4_t test_vcagtq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vcagtq_f32 return vcagtq_f32(v1, v2); - // CHECK: facgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcagtq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VCAGTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[VCAGTQ_V_I]], <2 x double> [[VCAGTQ_V1_I]]) #4 +// CHECK: ret <2 x i64> [[VCAGTQ_V2_I]] uint64x2_t test_vcagtq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vcagtq_f64 return vcagtq_f64(v1, v2); - // CHECK: facgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcale_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCALE_V_I]], <2 x float> [[VCALE_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCALE_V2_I]] uint32x2_t test_vcale_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: 
test_vcale_f32 return vcale_f32(v1, v2); // Using registers other than v0, v1 is possible, but would be odd. - // CHECK: facge {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <1 x i64> @test_vcale_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCALE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[VCALE_V_I]], <1 x double> [[VCALE_V1_I]]) #4 +// CHECK: ret <1 x i64> [[VCALE_V2_I]] uint64x1_t test_vcale_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vcale_f64 return vcale_f64(a, b); - // CHECK: facge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCALEQ_V_I]], <4 x float> [[VCALEQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCALEQ_V2_I]] uint32x4_t test_vcaleq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vcaleq_f32 return vcaleq_f32(v1, v2); // Using registers other than v0, v1 is possible, but would be odd. - // CHECK: facge {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <2 x i64> @test_vcaleq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCALEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[VCALEQ_V_I]], <2 x double> [[VCALEQ_V1_I]]) #4 +// CHECK: ret <2 x i64> [[VCALEQ_V2_I]] uint64x2_t test_vcaleq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vcaleq_f64 return vcaleq_f64(v1, v2); // Using registers other than v0, v1 is possible, but would be odd. - // CHECK: facge {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <2 x i32> @test_vcalt_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> +// CHECK: [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCALT_V_I]], <2 x float> [[VCALT_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCALT_V2_I]] uint32x2_t test_vcalt_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vcalt_f32 return vcalt_f32(v1, v2); // Using registers other than v0, v1 is possible, but would be odd.
- // CHECK: facgt {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <1 x i64> @test_vcalt_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCALT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[VCALT_V_I]], <1 x double> [[VCALT_V1_I]]) #4 +// CHECK: ret <1 x i64> [[VCALT_V2_I]] uint64x1_t test_vcalt_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vcalt_f64 return vcalt_f64(a, b); - // CHECK: facgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> +// CHECK: [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCALTQ_V_I]], <4 x float> [[VCALTQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCALTQ_V2_I]] uint32x4_t test_vcaltq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vcaltq_f32 return vcaltq_f32(v1, v2); // Using registers other than v0, v1 is possible, but would be odd. - // CHECK: facgt {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <2 x i64> @test_vcaltq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> +// CHECK: [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCALTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[VCALTQ_V_I]], <2 x double> [[VCALTQ_V1_I]]) #4 +// CHECK: ret <2 x i64> [[VCALTQ_V2_I]] uint64x2_t test_vcaltq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vcaltq_f64 return vcaltq_f64(v1, v2); // Using registers other than v0, v1 is possible, but would be odd.
- // CHECK: facgt {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <8 x i8> @test_vtst_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2 +// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> +// CHECK: ret <8 x i8> [[VTST_I]] uint8x8_t test_vtst_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vtst_s8 return vtst_s8(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vtst_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16> +// CHECK: ret <4 x i16> [[VTST_I]] uint16x4_t test_vtst_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vtst_s16 return vtst_s16(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vtst_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32> +// CHECK: ret <2 x i32> [[VTST_I]] uint32x2_t test_vtst_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vtst_s32 return vtst_s32(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vtst_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2 +// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> +// CHECK: ret <8 x i8> [[VTST_I]] uint8x8_t test_vtst_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vtst_u8 return vtst_u8(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vtst_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16> +// CHECK: ret <4 x i16> [[VTST_I]] uint16x4_t test_vtst_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vtst_u16 return vtst_u16(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vtst_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// 
CHECK: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32> +// CHECK: ret <2 x i32> [[VTST_I]] uint32x2_t test_vtst_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vtst_u32 return vtst_u32(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vtstq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2 +// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> +// CHECK: ret <16 x i8> [[VTST_I]] uint8x16_t test_vtstq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vtstq_s8 return vtstq_s8(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vtstq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16> +// CHECK: ret <8 x i16> [[VTST_I]] uint16x8_t test_vtstq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vtstq_s16 return vtstq_s16(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vtstq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> +// CHECK: ret <4 x i32> [[VTST_I]] uint32x4_t test_vtstq_s32(int32x4_t v1, int32x4_t v2) { - // CHECK-LABEL: test_vtstq_s32 return vtstq_s32(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vtstq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2 +// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> +// CHECK: ret <16 x i8> [[VTST_I]] uint8x16_t test_vtstq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vtstq_u8 return vtstq_u8(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vtstq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16> +// CHECK: ret <8 x i16> [[VTST_I]] uint16x8_t test_vtstq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vtstq_u16 return 
vtstq_u16(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vtstq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> +// CHECK: ret <4 x i32> [[VTST_I]] uint32x4_t test_vtstq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK-LABEL: test_vtstq_u32 return vtstq_u32(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vtstq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64> +// CHECK: ret <2 x i64> [[VTST_I]] uint64x2_t test_vtstq_s64(int64x2_t v1, int64x2_t v2) { - // CHECK-LABEL: test_vtstq_s64 return vtstq_s64(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vtstq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64> +// CHECK: ret <2 x i64> [[VTST_I]] uint64x2_t test_vtstq_u64(uint64x2_t v1, uint64x2_t v2) { - // CHECK-LABEL: test_vtstq_u64 return vtstq_u64(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vtst_p8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2 +// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> +// CHECK: ret <8 x i8> [[VTST_I]] uint8x8_t test_vtst_p8(poly8x8_t v1, poly8x8_t v2) { - // CHECK-LABEL: test_vtst_p8 return vtst_p8(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vtst_p16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16> +// CHECK: ret <4 x i16> [[VTST_I]] uint16x4_t test_vtst_p16(poly16x4_t v1, poly16x4_t v2) { - // CHECK-LABEL: test_vtst_p16 return vtst_p16(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } 
+// CHECK-LABEL: define <16 x i8> @test_vtstq_p8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2 +// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> +// CHECK: ret <16 x i8> [[VTST_I]] uint8x16_t test_vtstq_p8(poly8x16_t v1, poly8x16_t v2) { - // CHECK-LABEL: test_vtstq_p8 return vtstq_p8(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vtstq_p16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16> +// CHECK: ret <8 x i16> [[VTST_I]] uint16x8_t test_vtstq_p16(poly16x8_t v1, poly16x8_t v2) { - // CHECK-LABEL: test_vtstq_p16 return vtstq_p16(v1, v2); - // CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <1 x i64> @test_vtst_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64> +// CHECK: ret <1 x i64> [[VTST_I]] uint64x1_t test_vtst_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vtst_s64 return vtst_s64(a, b); - // CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vtst_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64> +// CHECK: ret <1 x i64> [[VTST_I]] uint64x1_t test_vtst_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vtst_u64 return vtst_u64(a, b); - // CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vceq_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vceq_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vceq_s8 return vceq_s8(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vceq_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vceq_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vceq_s16 return vceq_s16(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> 
@test_vceq_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vceq_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vceq_s32 return vceq_s32(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vceq_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vceq_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vceq_s64 return vceq_s64(a, b); - // CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vceq_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vceq_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vceq_u64 return vceq_u64(a, b); - // CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x i32> @test_vceq_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vceq_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vceq_f32 return vceq_f32(v1, v2); - // CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vceq_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oeq <1 x double> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vceq_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vceq_f64 return vceq_f64(a, b); - // CHECK: fcmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vceq_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vceq_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vceq_u8 return vceq_u8(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vceq_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vceq_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vceq_u16 return vceq_u16(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vceq_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vceq_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vceq_u32 return vceq_u32(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vceq_p8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vceq_p8(poly8x8_t v1, poly8x8_t 
v2) { - // CHECK-LABEL: test_vceq_p8 return vceq_p8(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vceqq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vceqq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vceqq_s8 return vceqq_s8(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vceqq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vceqq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vceqq_s16 return vceqq_s16(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vceqq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vceqq_s32(int32x4_t v1, int32x4_t v2) { - // CHECK-LABEL: test_vceqq_s32 return vceqq_s32(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vceqq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oeq <4 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vceqq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vceqq_f32 return vceqq_f32(v1, v2); - // CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vceqq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vceqq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vceqq_u8 return vceqq_u8(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vceqq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vceqq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vceqq_u16 return vceqq_u16(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vceqq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vceqq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK-LABEL: test_vceqq_u32 return vceqq_u32(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vceqq_p8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vceqq_p8(poly8x16_t v1, poly8x16_t v2) { - // CHECK-LABEL: test_vceqq_p8 return vceqq_p8(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <2 x i64> 
@test_vceqq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vceqq_s64(int64x2_t v1, int64x2_t v2) { - // CHECK-LABEL: test_vceqq_s64 return vceqq_s64(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vceqq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vceqq_u64(uint64x2_t v1, uint64x2_t v2) { - // CHECK-LABEL: test_vceqq_u64 return vceqq_u64(v1, v2); - // CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vceqq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x double> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vceqq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vceqq_f64 return vceqq_f64(v1, v2); - // CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vcge_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcge_s8(int8x8_t v1, int8x8_t v2) { -// CHECK-LABEL: test_vcge_s8 return vcge_s8(v1, v2); -// CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vcge_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcge_s16(int16x4_t v1, int16x4_t v2) { -// CHECK-LABEL: test_vcge_s16 return vcge_s16(v1, v2); -// CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vcge_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcge_s32(int32x2_t v1, int32x2_t v2) { -// CHECK-LABEL: test_vcge_s32 return vcge_s32(v1, v2); -// CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vcge_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcge_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vcge_s64 return vcge_s64(a, b); - // CHECK: cmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcge_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcge_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vcge_u64 return vcge_u64(a, b); - // CHECK: cmhs {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x i32> @test_vcge_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t 
test_vcge_f32(float32x2_t v1, float32x2_t v2) { -// CHECK-LABEL: test_vcge_f32 return vcge_f32(v1, v2); -// CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vcge_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oge <1 x double> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcge_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vcge_f64 return vcge_f64(a, b); - // CHECK: fcmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vcge_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcge_u8(uint8x8_t v1, uint8x8_t v2) { -// CHECK-LABEL: test_vcge_u8 return vcge_u8(v1, v2); -// CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vcge_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcge_u16(uint16x4_t v1, uint16x4_t v2) { -// CHECK-LABEL: test_vcge_u16 return vcge_u16(v1, v2); -// CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vcge_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcge_u32(uint32x2_t v1, uint32x2_t v2) { -// CHECK-LABEL: test_vcge_u32 return vcge_u32(v1, v2); -// CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vcgeq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgeq_s8(int8x16_t v1, int8x16_t v2) { -// CHECK-LABEL: test_vcgeq_s8 return vcgeq_s8(v1, v2); -// CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vcgeq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgeq_s16(int16x8_t v1, int16x8_t v2) { -// CHECK-LABEL: test_vcgeq_s16 return vcgeq_s16(v1, v2); -// CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vcgeq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgeq_s32(int32x4_t v1, int32x4_t v2) { -// CHECK-LABEL: test_vcgeq_s32 return vcgeq_s32(v1, v2); -// CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vcgeq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oge <4 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgeq_f32(float32x4_t v1, float32x4_t v2) { -// CHECK-LABEL: test_vcgeq_f32 return vcgeq_f32(v1, v2); -// CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> 
@test_vcgeq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgeq_u8(uint8x16_t v1, uint8x16_t v2) { -// CHECK-LABEL: test_vcgeq_u8 return vcgeq_u8(v1, v2); -// CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vcgeq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgeq_u16(uint16x8_t v1, uint16x8_t v2) { -// CHECK-LABEL: test_vcgeq_u16 return vcgeq_u16(v1, v2); -// CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vcgeq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgeq_u32(uint32x4_t v1, uint32x4_t v2) { -// CHECK-LABEL: test_vcgeq_u32 return vcgeq_u32(v1, v2); -// CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcgeq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcgeq_s64(int64x2_t v1, int64x2_t v2) { -// CHECK-LABEL: test_vcgeq_s64 return vcgeq_s64(v1, v2); -// CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vcgeq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcgeq_u64(uint64x2_t v1, uint64x2_t v2) { -// CHECK-LABEL: test_vcgeq_u64 return vcgeq_u64(v1, v2); -// CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vcgeq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x double> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcgeq_f64(float64x2_t v1, float64x2_t v2) { -// CHECK-LABEL: test_vcgeq_f64 return vcgeq_f64(v1, v2); -// CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } // Notes about vcle: // LE condition predicate implemented as GE, so check reversed operands. // Using registers other than v0, v1 is possible, but would be odd.
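// For reference, the swap follows from the identity "a <= b iff b >= a": AArch64 SIMD has no
// register-register LE compare, so vcle_* is emitted as CMGE/CMHS/FCMGE with reversed operands.
// A minimal sketch of that identity, using only arm_neon.h intrinsics already included by this
// file (the helper name is illustrative, not an intrinsic):
static inline uint8x8_t vcle_s8_as_reversed_vcge(int8x8_t a, int8x8_t b) {
  // Produces the same all-ones/all-zeros lane mask as vcle_s8(a, b).
  return vcge_s8(b, a);
}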
+// CHECK-LABEL: define <8 x i8> @test_vcle_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcle_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vcle_s8 return vcle_s8(v1, v2); - // CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b } +// CHECK-LABEL: define <4 x i16> @test_vcle_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcle_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vcle_s16 return vcle_s16(v1, v2); - // CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h } +// CHECK-LABEL: define <2 x i32> @test_vcle_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcle_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vcle_s32 return vcle_s32(v1, v2); - // CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <1 x i64> @test_vcle_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcle_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vcle_s64 return vcle_s64(a, b); - // CHECK: cmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcle_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcle_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vcle_u64 return vcle_u64(a, b); - // CHECK: cmhs {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x i32> @test_vcle_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcle_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vcle_f32 return vcle_f32(v1, v2); - // CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <1 x i64> @test_vcle_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ole <1 x double> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcle_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vcle_f64 return vcle_f64(a, b); - // CHECK: fcmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vcle_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcle_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vcle_u8 return vcle_u8(v1, v2); - // CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b } +// CHECK-LABEL: define <4 x i16> @test_vcle_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcle_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vcle_u16 return vcle_u16(v1, 
v2); - // CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h } +// CHECK-LABEL: define <2 x i32> @test_vcle_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcle_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vcle_u32 return vcle_u32(v1, v2); - // CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <16 x i8> @test_vcleq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcleq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vcleq_s8 return vcleq_s8(v1, v2); - // CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b } +// CHECK-LABEL: define <8 x i16> @test_vcleq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcleq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vcleq_s16 return vcleq_s16(v1, v2); - // CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h } +// CHECK-LABEL: define <4 x i32> @test_vcleq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcleq_s32(int32x4_t v1, int32x4_t v2) { - // CHECK-LABEL: test_vcleq_s32 return vcleq_s32(v1, v2); - // CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <4 x i32> @test_vcleq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ole <4 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcleq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vcleq_f32 return vcleq_f32(v1, v2); - // CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <16 x i8> @test_vcleq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcleq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vcleq_u8 return vcleq_u8(v1, v2); - // CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b } +// CHECK-LABEL: define <8 x i16> @test_vcleq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcleq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vcleq_u16 return vcleq_u16(v1, v2); - // CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h } +// CHECK-LABEL: define <4 x i32> @test_vcleq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcleq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK-LABEL: test_vcleq_u32 return vcleq_u32(v1, v2); - // CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <2 x i64> @test_vcleq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t 
test_vcleq_s64(int64x2_t v1, int64x2_t v2) { - // CHECK-LABEL: test_vcleq_s64 return vcleq_s64(v1, v2); - // CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <2 x i64> @test_vcleq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcleq_u64(uint64x2_t v1, uint64x2_t v2) { - // CHECK-LABEL: test_vcleq_u64 return vcleq_u64(v1, v2); - // CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <2 x i64> @test_vcleq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x double> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcleq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vcleq_f64 return vcleq_f64(v1, v2); - // CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <8 x i8> @test_vcgt_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcgt_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vcgt_s8 return vcgt_s8(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vcgt_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcgt_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vcgt_s16 return vcgt_s16(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vcgt_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcgt_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vcgt_s32 return vcgt_s32(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vcgt_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcgt_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vcgt_s64 return vcgt_s64(a, b); - // CHECK: cmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcgt_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcgt_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vcgt_u64 return vcgt_u64(a, b); - // CHECK: cmhi {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x i32> @test_vcgt_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcgt_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vcgt_f32 return vcgt_f32(v1, v2); - // CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vcgt_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: 
[[CMP_I:%.*]] = fcmp ogt <1 x double> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vcgt_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vcgt_f64 return vcgt_f64(a, b); - // CHECK: fcmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vcgt_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcgt_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vcgt_u8 return vcgt_u8(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vcgt_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcgt_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vcgt_u16 return vcgt_u16(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vcgt_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcgt_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vcgt_u32 return vcgt_u32(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vcgtq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgtq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vcgtq_s8 return vcgtq_s8(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vcgtq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgtq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vcgtq_s16 return vcgtq_s16(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vcgtq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgtq_s32(int32x4_t v1, int32x4_t v2) { - // CHECK-LABEL: test_vcgtq_s32 return vcgtq_s32(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vcgtq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ogt <4 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgtq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vcgtq_f32 return vcgtq_f32(v1, v2); - // CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vcgtq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgtq_u8(uint8x16_t v1, uint8x16_t 
v2) { - // CHECK-LABEL: test_vcgtq_u8 return vcgtq_u8(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vcgtq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgtq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vcgtq_u16 return vcgtq_u16(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vcgtq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgtq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK-LABEL: test_vcgtq_u32 return vcgtq_u32(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcgtq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcgtq_s64(int64x2_t v1, int64x2_t v2) { - // CHECK-LABEL: test_vcgtq_s64 return vcgtq_s64(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vcgtq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcgtq_u64(uint64x2_t v1, uint64x2_t v2) { - // CHECK-LABEL: test_vcgtq_u64 return vcgtq_u64(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vcgtq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x double> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcgtq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vcgtq_f64 return vcgtq_f64(v1, v2); - // CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } @@ -1632,10112 +2529,20029 @@ // LT condition predicate implemented as GT, so check reversed operands. // Using registers other than v0, v1 is possible, but would be odd.
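// For reference, this is the same reversal trick as vcle above, via "a < b iff b > a": vclt_*
// reuses CMGT/CMHI/FCMGT with swapped operands. A small sketch of the identity and of the
// all-ones/all-zeros lane mask that the icmp+sext IR pattern below encodes (illustrative
// helper name, arm_neon.h intrinsics only):
static inline uint32x2_t vclt_u32_as_reversed_vcgt(uint32x2_t a, uint32x2_t b) {
  // Each lane is 0xFFFFFFFF where a < b and 0x00000000 elsewhere,
  // matching the "icmp ult" + "sext" sequence the CHECK lines expect.
  return vcgt_u32(b, a);
}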
+// CHECK-LABEL: define <8 x i8> @test_vclt_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vclt_s8(int8x8_t v1, int8x8_t v2) { - // CHECK-LABEL: test_vclt_s8 return vclt_s8(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b } +// CHECK-LABEL: define <4 x i16> @test_vclt_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vclt_s16(int16x4_t v1, int16x4_t v2) { - // CHECK-LABEL: test_vclt_s16 return vclt_s16(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h } +// CHECK-LABEL: define <2 x i32> @test_vclt_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vclt_s32(int32x2_t v1, int32x2_t v2) { - // CHECK-LABEL: test_vclt_s32 return vclt_s32(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <1 x i64> @test_vclt_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vclt_s64(int64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vclt_s64 return vclt_s64(a, b); - // CHECK: cmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vclt_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vclt_u64(uint64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vclt_u64 return vclt_u64(a, b); - // CHECK: cmhi {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x i32> @test_vclt_f32(<2 x float> %v1, <2 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vclt_f32(float32x2_t v1, float32x2_t v2) { - // CHECK-LABEL: test_vclt_f32 return vclt_f32(v1, v2); - // CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <1 x i64> @test_vclt_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp olt <1 x double> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vclt_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vclt_f64 return vclt_f64(a, b); - // CHECK: fcmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vclt_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vclt_u8(uint8x8_t v1, uint8x8_t v2) { - // CHECK-LABEL: test_vclt_u8 return vclt_u8(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b } +// CHECK-LABEL: define <4 x i16> @test_vclt_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vclt_u16(uint16x4_t v1, uint16x4_t v2) { - // CHECK-LABEL: test_vclt_u16 return vclt_u16(v1, 
v2); - // CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h } +// CHECK-LABEL: define <2 x i32> @test_vclt_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vclt_u32(uint32x2_t v1, uint32x2_t v2) { - // CHECK-LABEL: test_vclt_u32 return vclt_u32(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s } +// CHECK-LABEL: define <16 x i8> @test_vcltq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcltq_s8(int8x16_t v1, int8x16_t v2) { - // CHECK-LABEL: test_vcltq_s8 return vcltq_s8(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b } +// CHECK-LABEL: define <8 x i16> @test_vcltq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcltq_s16(int16x8_t v1, int16x8_t v2) { - // CHECK-LABEL: test_vcltq_s16 return vcltq_s16(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h } +// CHECK-LABEL: define <4 x i32> @test_vcltq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcltq_s32(int32x4_t v1, int32x4_t v2) { - // CHECK-LABEL: test_vcltq_s32 return vcltq_s32(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <4 x i32> @test_vcltq_f32(<4 x float> %v1, <4 x float> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp olt <4 x float> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcltq_f32(float32x4_t v1, float32x4_t v2) { - // CHECK-LABEL: test_vcltq_f32 return vcltq_f32(v1, v2); - // CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <16 x i8> @test_vcltq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <16 x i8> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcltq_u8(uint8x16_t v1, uint8x16_t v2) { - // CHECK-LABEL: test_vcltq_u8 return vcltq_u8(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b } +// CHECK-LABEL: define <8 x i16> @test_vcltq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i16> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcltq_u16(uint16x8_t v1, uint16x8_t v2) { - // CHECK-LABEL: test_vcltq_u16 return vcltq_u16(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h } +// CHECK-LABEL: define <4 x i32> @test_vcltq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i32> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcltq_u32(uint32x4_t v1, uint32x4_t v2) { - // CHECK-LABEL: test_vcltq_u32 return vcltq_u32(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s } +// CHECK-LABEL: define <2 x i64> @test_vcltq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t 
test_vcltq_s64(int64x2_t v1, int64x2_t v2) { - // CHECK-LABEL: test_vcltq_s64 return vcltq_s64(v1, v2); - // CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <2 x i64> @test_vcltq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i64> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcltq_u64(uint64x2_t v1, uint64x2_t v2) { - // CHECK-LABEL: test_vcltq_u64 return vcltq_u64(v1, v2); - // CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <2 x i64> @test_vcltq_f64(<2 x double> %v1, <2 x double> %v2) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x double> %v1, %v2 +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vcltq_f64(float64x2_t v1, float64x2_t v2) { - // CHECK-LABEL: test_vcltq_f64 return vcltq_f64(v1, v2); - // CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d } +// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4 +// CHECK: ret <8 x i8> [[VHADD_V_I]] int8x8_t test_vhadd_s8(int8x8_t v1, int8x8_t v2) { -// CHECK-LABEL: test_vhadd_s8 return vhadd_s8(v1, v2); - // CHECK: shadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vhadd_s16(int16x4_t v1, int16x4_t v2) { -// CHECK-LABEL: test_vhadd_s16 return vhadd_s16(v1, v2); - // CHECK: shadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vhadd_s32(int32x2_t v1, int32x2_t v2) { -// CHECK-LABEL: test_vhadd_s32 return vhadd_s32(v1, v2); - // CHECK: shadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4 +// CHECK: ret <8 x i8> [[VHADD_V_I]] uint8x8_t test_vhadd_u8(uint8x8_t v1, uint8x8_t v2) { -// CHECK-LABEL: test_vhadd_u8 return vhadd_u8(v1, v2); - // CHECK: uhadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vhadd_u16(uint16x4_t v1, uint16x4_t v2) { -// CHECK-LABEL: test_vhadd_u16 return vhadd_u16(v1, v2); - // CHECK: uhadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vhadd_u32(uint32x2_t v1, uint32x2_t v2) { -// CHECK-LABEL: test_vhadd_u32 return vhadd_u32(v1, v2); - // CHECK: uhadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4 +// CHECK: ret <16 x i8> [[VHADDQ_V_I]] int8x16_t test_vhaddq_s8(int8x16_t v1, int8x16_t v2) { -// CHECK-LABEL: test_vhaddq_s8 return vhaddq_s8(v1, v2); - // CHECK: shadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vhaddq_s16(int16x8_t v1, int16x8_t v2) { -// CHECK-LABEL: test_vhaddq_s16 return vhaddq_s16(v1, v2); - // CHECK: shadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t 
test_vhaddq_s32(int32x4_t v1, int32x4_t v2) { -// CHECK-LABEL: test_vhaddq_s32 return vhaddq_s32(v1, v2); - // CHECK: shadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 { +// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4 +// CHECK: ret <16 x i8> [[VHADDQ_V_I]] uint8x16_t test_vhaddq_u8(uint8x16_t v1, uint8x16_t v2) { -// CHECK-LABEL: test_vhaddq_u8 return vhaddq_u8(v1, v2); - // CHECK: uhadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vhaddq_u16(uint16x8_t v1, uint16x8_t v2) { -// CHECK-LABEL: test_vhaddq_u16 return vhaddq_u16(v1, v2); - // CHECK: uhadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vhaddq_u32(uint32x4_t v1, uint32x4_t v2) { -// CHECK-LABEL: test_vhaddq_u32 return vhaddq_u32(v1, v2); - // CHECK: uhadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i8> @test_vhsub_s8(<8 x i8> %v1, <8 x i8> %v2) #0 { +// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4 +// CHECK: ret <8 x i8> [[VHSUB_V_I]] int8x8_t test_vhsub_s8(int8x8_t v1, int8x8_t v2) { -// CHECK-LABEL: test_vhsub_s8 return vhsub_s8(v1, v2); - // CHECK: shsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vhsub_s16(<4 x i16> %v1, <4 x i16> %v2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> +// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4 +// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vhsub_s16(int16x4_t v1, int16x4_t v2) { -// CHECK-LABEL: test_vhsub_s16 return vhsub_s16(v1, v2); - // CHECK: shsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h 
}
+// CHECK-LABEL: define <2 x i32> @test_vhsub_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
+// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vhsub_s32(int32x2_t v1, int32x2_t v2) {
-// CHECK-LABEL: test_vhsub_s32
  return vhsub_s32(v1, v2);
- // CHECK: shsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <8 x i8> @test_vhsub_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK: ret <8 x i8> [[VHSUB_V_I]]
uint8x8_t test_vhsub_u8(uint8x8_t v1, uint8x8_t v2) {
-// CHECK-LABEL: test_vhsub_u8
  return vhsub_u8(v1, v2);
- // CHECK: uhsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vhsub_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
+// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vhsub_u16(uint16x4_t v1, uint16x4_t v2) {
-// CHECK-LABEL: test_vhsub_u16
  return vhsub_u16(v1, v2);
- // CHECK: uhsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vhsub_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
+// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vhsub_u32(uint32x2_t v1, uint32x2_t v2) {
-// CHECK-LABEL: test_vhsub_u32
  return vhsub_u32(v1, v2);
- // CHECK: uhsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
int8x16_t test_vhsubq_s8(int8x16_t v1, int8x16_t v2) {
-// CHECK-LABEL: test_vhsubq_s8
  return vhsubq_s8(v1, v2);
- // CHECK: shsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
+// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vhsubq_s16(int16x8_t v1, int16x8_t v2) {
-// CHECK-LABEL: test_vhsubq_s16
  return vhsubq_s16(v1, v2);
- // CHECK: shsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
+// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vhsubq_s32(int32x4_t v1, int32x4_t v2) {
-// CHECK-LABEL: test_vhsubq_s32
  return vhsubq_s32(v1, v2);
- // CHECK: shsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
uint8x16_t test_vhsubq_u8(uint8x16_t v1, uint8x16_t v2) {
-// CHECK-LABEL: test_vhsubq_u8
  return vhsubq_u8(v1, v2);
- // CHECK: uhsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
+// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
uint16x8_t test_vhsubq_u16(uint16x8_t v1, uint16x8_t v2) {
-// CHECK-LABEL: test_vhsubq_u16
  return vhsubq_u16(v1, v2);
- // CHECK: uhsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
+// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
uint32x4_t test_vhsubq_u32(uint32x4_t v1, uint32x4_t v2) {
-// CHECK-LABEL: test_vhsubq_u32
  return vhsubq_u32(v1, v2);
- // CHECK: uhsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK: ret <8 x i8> [[VRHADD_V_I]]
int8x8_t test_vrhadd_s8(int8x8_t v1, int8x8_t v2) {
-// CHECK-LABEL: test_vrhadd_s8
  return vrhadd_s8(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vrhadd_s16(int16x4_t v1, int16x4_t v2) {
-// CHECK-LABEL: test_vrhadd_s16
  return vrhadd_s16(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vrhadd_s32(int32x2_t v1, int32x2_t v2) {
-// CHECK-LABEL: test_vrhadd_s32
  return vrhadd_s32(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK: ret <8 x i8> [[VRHADD_V_I]]
uint8x8_t test_vrhadd_u8(uint8x8_t v1, uint8x8_t v2) {
-// CHECK-LABEL: test_vrhadd_u8
  return vrhadd_u8(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vrhadd_u16(uint16x4_t v1, uint16x4_t v2) {
-// CHECK-LABEL: test_vrhadd_u16
  return vrhadd_u16(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
+// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vrhadd_u32(uint32x2_t v1, uint32x2_t v2) {
-// CHECK-LABEL: test_vrhadd_u32
  return vrhadd_u32(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
int8x16_t test_vrhaddq_s8(int8x16_t v1, int8x16_t v2) {
-// CHECK-LABEL: test_vrhaddq_s8
  return vrhaddq_s8(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vrhaddq_s16(int16x8_t v1, int16x8_t v2) {
-// CHECK-LABEL: test_vrhaddq_s16
  return vrhaddq_s16(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vrhaddq_s32(int32x4_t v1, int32x4_t v2) {
-// CHECK-LABEL: test_vrhaddq_s32
  return vrhaddq_s32(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
uint8x16_t test_vrhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
-// CHECK-LABEL: test_vrhaddq_u8
  return vrhaddq_u8(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
uint16x8_t test_vrhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
-// CHECK-LABEL: test_vrhaddq_u16
  return vrhaddq_u16(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
+// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
uint32x4_t test_vrhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
-// CHECK-LABEL: test_vrhaddq_u32
  return vrhaddq_u32(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <8 x i8> @test_vqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQADD_V_I]]
int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqadd_s8
  return vqadd_s8(a, b);
- // CHECK: sqadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqadd_s16
  return vqadd_s16(a, b);
- // CHECK: sqadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqadd_s32
  return vqadd_s32(a, b);
- // CHECK: sqadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqadd_s64
  return vqadd_s64(a, b);
-// CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <8 x i8> @test_vqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQADD_V_I]]
uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vqadd_u8
  return vqadd_u8(a, b);
- // CHECK: uqadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vqadd_u16
  return vqadd_u16(a, b);
- // CHECK: uqadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vqadd_u32
  return vqadd_u32(a, b);
- // CHECK: uqadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK: test_vqadd_u64
  return vqadd_u64(a, b);
-// CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqaddq_s8
  return vqaddq_s8(a, b);
- // CHECK: sqadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqaddq_s16
  return vqaddq_s16(a, b);
- // CHECK: sqadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqaddq_s32
  return vqaddq_s32(a, b);
- // CHECK: sqadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqaddq_s64
  return vqaddq_s64(a, b);
-// CHECK: sqadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vqaddq_u8
  return vqaddq_u8(a, b);
- // CHECK: uqadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vqaddq_u16
  return vqaddq_u16(a, b);
- // CHECK: uqadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vqaddq_u32
  return vqaddq_u32(a, b);
- // CHECK: uqadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
-// CHECK-LABEL: test_vqaddq_u64
  return vqaddq_u64(a, b);
-// CHECK: uqadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <8 x i8> @test_vqsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQSUB_V_I]]
int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqsub_s8
  return vqsub_s8(a, b);
- // CHECK: sqsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vqsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqsub_s16
  return vqsub_s16(a, b);
- // CHECK: sqsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vqsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqsub_s32
  return vqsub_s32(a, b);
- // CHECK: sqsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vqsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqsub_s64
  return vqsub_s64(a, b);
-// CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <8 x i8> @test_vqsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQSUB_V_I]]
uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vqsub_u8
  return vqsub_u8(a, b);
- // CHECK: uqsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vqsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vqsub_u16
  return vqsub_u16(a, b);
- // CHECK: uqsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vqsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vqsub_u32
  return vqsub_u32(a, b);
- // CHECK: uqsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vqsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
+// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vqsub_u64
  return vqsub_u64(a, b);
-// CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <16 x i8> @test_vqsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqsubq_s8
  return vqsubq_s8(a, b);
- // CHECK: sqsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqsubq_s16
  return vqsubq_s16(a, b);
- // CHECK: sqsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqsubq_s32
  return vqsubq_s32(a, b);
- // CHECK: sqsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqsubq_s64
  return vqsubq_s64(a, b);
-// CHECK: sqsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <16 x i8> @test_vqsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vqsubq_u8
  return vqsubq_u8(a, b);
- // CHECK: uqsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vqsubq_u16
  return vqsubq_u16(a, b);
- // CHECK: uqsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vqsubq_u32
  return vqsubq_u32(a, b);
- // CHECK: uqsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
+// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
-// CHECK-LABEL: test_vqsubq_u64
  return vqsubq_u64(a, b);
- // CHECK: uqsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <8 x i8> @test_vshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VSHL_V_I]]
int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vshl_s8
  return vshl_s8(a, b);
-// CHECK: sshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
+// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vshl_s16
  return vshl_s16(a, b);
-// CHECK: sshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
+// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vshl_s32
  return vshl_s32(a, b);
-// CHECK: sshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
+// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vshl_s64
  return vshl_s64(a, b);
-// CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <8 x i8> @test_vshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VSHL_V_I]]
uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vshl_u8
  return vshl_u8(a, b);
-// CHECK: ushl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
+// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vshl_u16
  return vshl_u16(a, b);
-// CHECK: ushl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
+// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vshl_u32
  return vshl_u32(a, b);
-// CHECK: ushl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
+// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vshl_u64
  return vshl_u64(a, b);
-// CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <16 x i8> @test_vshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vshlq_s8
  return vshlq_s8(a, b);
-// CHECK: sshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vshlq_s16
  return vshlq_s16(a, b);
-// CHECK: sshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vshlq_s32
  return vshlq_s32(a, b);
-// CHECK: sshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vshlq_s64
  return vshlq_s64(a, b);
-// CHECK: sshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <16 x i8> @test_vshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vshlq_u8
  return vshlq_u8(a, b);
-// CHECK: ushl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vshlq_u16
  return vshlq_u16(a, b);
-// CHECK: ushl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vshlq_u32
  return vshlq_u32(a, b);
-// CHECK: ushl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
+// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vshlq_u64
  return vshlq_u64(a, b);
-// CHECK: ushl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQSHL_V_I]]
int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqshl_s8
  return vqshl_s8(a, b);
-// CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqshl_s16
  return vqshl_s16(a, b);
-// CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqshl_s32
  return vqshl_s32(a, b);
-// CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqshl_s64
  return vqshl_s64(a, b);
-// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQSHL_V_I]]
uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqshl_u8
  return vqshl_u8(a, b);
-// CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqshl_u16
  return vqshl_u16(a, b);
-// CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqshl_u32
  return vqshl_u32(a, b);
-// CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
+// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqshl_u64
  return vqshl_u64(a, b);
-// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqshlq_s8
  return vqshlq_s8(a, b);
-// CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqshlq_s16
  return vqshlq_s16(a, b);
-// CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqshlq_s32
  return vqshlq_s32(a, b);
-// CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqshlq_s64
  return vqshlq_s64(a, b);
-// CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqshlq_u8
  return vqshlq_u8(a, b);
-// CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqshlq_u16
  return vqshlq_u16(a, b);
-// CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqshlq_u32
  return vqshlq_u32(a, b);
-// CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
+// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqshlq_u64
  return vqshlq_u64(a, b);
-// CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
}
+// CHECK-LABEL: define <8 x i8> @test_vrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VRSHL_V_I]]
int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vrshl_s8
  return vrshl_s8(a, b);
-// CHECK: srshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vrshl_s16
  return vrshl_s16(a, b);
-// CHECK: srshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vrshl_s32
  return vrshl_s32(a, b);
-// CHECK: srshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vrshl_s64
  return vrshl_s64(a, b);
-// CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
+// CHECK-LABEL: define <8 x i8> @test_vrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VRSHL_V_I]]
uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vrshl_u8
  return vrshl_u8(a, b);
-// CHECK: urshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vrshl_u16
  return vrshl_u16(a, b);
-// CHECK: urshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
}
+// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
+// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vrshl_u32
  return vrshl_u32(a, b);
-// CHECK: urshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
}
+// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x
i8> +// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4 +// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) { -// CHECK-LABEL: test_vrshl_u64 return vrshl_u64(a, b); -// CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <16 x i8> @test_vrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VRSHLQ_V_I]] int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vrshlq_s8 return vrshlq_s8(a, b); -// CHECK: srshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vrshlq_s16 return vrshlq_s16(a, b); -// CHECK: srshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vrshlq_s32 return vrshlq_s32(a, b); -// CHECK: srshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) { -// CHECK-LABEL: test_vrshlq_s64 return vrshlq_s64(a, b); -// CHECK: srshl {{v[0-9]+}}.2d, 
{{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VRSHLQ_V_I]] uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vrshlq_u8 return vrshlq_u8(a, b); -// CHECK: urshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vrshlq_u16 return vrshlq_u16(a, b); -// CHECK: urshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vrshlq_u32 return vrshlq_u32(a, b); -// CHECK: urshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) { -// CHECK-LABEL: test_vrshlq_u64 return vrshlq_u64(a, b); -// CHECK: urshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vqrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQRSHL_V_I]] int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) { -// CHECK-LABEL: test_vqrshl_s8 return vqrshl_s8(a, b); -// CHECK: sqrshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: 
[[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vqrshl_s16 return vqrshl_s16(a, b); -// CHECK: sqrshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vqrshl_s32 return vqrshl_s32(a, b); -// CHECK: sqrshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) { -// CHECK-LABEL: test_vqrshl_s64 return vqrshl_s64(a, b); -// CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vqrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQRSHL_V_I]] uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) { -// CHECK-LABEL: test_vqrshl_u8 return vqrshl_u8(a, b); -// CHECK: uqrshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vqrshl_u16 return vqrshl_u16(a, b); 
-// CHECK: uqrshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vqrshl_u32 return vqrshl_u32(a, b); -// CHECK: uqrshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) { -// CHECK-LABEL: test_vqrshl_u64 return vqrshl_u64(a, b); -// CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]] int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vqrshlq_s8 return vqrshlq_s8(a, b); -// CHECK: sqrshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vqrshlq_s16 return vqrshlq_s16(a, b); -// CHECK: sqrshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4 +// CHECK: 
[[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vqrshlq_s32 return vqrshlq_s32(a, b); -// CHECK: sqrshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) { -// CHECK-LABEL: test_vqrshlq_s64 return vqrshlq_s64(a, b); -// CHECK: sqrshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } -// CHECK-LABEL: test_vqrshlq_u8 +// CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]] uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) { return vqrshlq_u8(a, b); -// CHECK: uqrshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vqrshlq_u16 return vqrshlq_u16(a, b); -// CHECK: uqrshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vqrshlq_u32 return vqrshlq_u32(a, b); -// CHECK: uqrshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast 
<2 x i64> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) { -// CHECK-LABEL: test_vqrshlq_u64 return vqrshlq_u64(a, b); -// CHECK: uqrshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <1 x i64> @test_vsli_n_p64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 0) +// CHECK: ret <1 x i64> [[VSLI_N2]] poly64x1_t test_vsli_n_p64(poly64x1_t a, poly64x1_t b) { -// CHECK-LABEL: test_vsli_n_p64 return vsli_n_p64(a, b, 0); -// CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #0 } +// CHECK-LABEL: define <2 x i64> @test_vsliq_n_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 0) +// CHECK: ret <2 x i64> [[VSLI_N2]] poly64x2_t test_vsliq_n_p64(poly64x2_t a, poly64x2_t b) { -// CHECK-LABEL: test_vsliq_n_p64 return vsliq_n_p64(a, b, 0); -// CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 } +// CHECK-LABEL: define <8 x i8> @test_vmax_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMAX_I]] int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) { -// CHECK-LABEL: test_vmax_s8 return vmax_s8(a, b); -// CHECK: smax {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmax_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VMAX2_I]] int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vmax_s16 return vmax_s16(a, b); -// CHECK: smax {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmax_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <2 x i32>
[[VMAX2_I]] int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vmax_s32 return vmax_s32(a, b); -// CHECK: smax {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vmax_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMAX_I]] uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) { -// CHECK-LABEL: test_vmax_u8 return vmax_u8(a, b); -// CHECK: umax {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmax_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VMAX2_I]] uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) { -// CHECK-LABEL: test_vmax_u16 return vmax_u16(a, b); -// CHECK: umax {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmax_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <2 x i32> [[VMAX2_I]] uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) { -// CHECK-LABEL: test_vmax_u32 return vmax_u32(a, b); -// CHECK: umax {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vmax_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> [[VMAX_I]], <2 x float> [[VMAX1_I]]) #4 +// CHECK: ret <2 x float> [[VMAX2_I]] float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vmax_f32 return vmax_f32(a, b); -// CHECK: fmax {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMAX_I]] int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vmaxq_s8 return vmaxq_s8(a, b); -// CHECK: smax {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <8 x i16> [[VMAX2_I]] int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) { -// 
CHECK-LABEL: test_vmaxq_s16 return vmaxq_s16(a, b); -// CHECK: smax {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i32> [[VMAX2_I]] int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vmaxq_s32 return vmaxq_s32(a, b); -// CHECK: smax {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMAX_I]] uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) { -// CHECK-LABEL: test_vmaxq_u8 return vmaxq_u8(a, b); -// CHECK: umax {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <8 x i16> [[VMAX2_I]] uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) { -// CHECK-LABEL: test_vmaxq_u16 return vmaxq_u16(a, b); -// CHECK: umax {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i32> [[VMAX2_I]] uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) { -// CHECK-LABEL: test_vmaxq_u32 return vmaxq_u32(a, b); -// CHECK: umax {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vmaxq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> [[VMAX_I]], <4 x float> [[VMAX1_I]]) #4 +// CHECK: ret <4 x float> [[VMAX2_I]] float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vmaxq_f32 return vmaxq_f32(a, b); -// CHECK: fmax {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vmaxq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: 
[[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> [[VMAX_I]], <2 x double> [[VMAX1_I]]) #4 +// CHECK: ret <2 x double> [[VMAX2_I]] float64x2_t test_vmaxq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vmaxq_f64 return vmaxq_f64(a, b); -// CHECK: fmax {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vmin_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMIN_I]] int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) { -// CHECK-LABEL: test_vmin_s8 return vmin_s8(a, b); -// CHECK: smin {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmin_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VMIN2_I]] int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vmin_s16 return vmin_s16(a, b); -// CHECK: smin {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmin_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VMIN2_I]] int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vmin_s32 return vmin_s32(a, b); -// CHECK: smin {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vmin_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMIN_I]] uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) { -// CHECK-LABEL: test_vmin_u8 return vmin_u8(a, b); -// CHECK: umin {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vmin_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VMIN2_I]] uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) { -// CHECK-LABEL: test_vmin_u16 return vmin_u16(a, b); -// CHECK: umin {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vmin_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.umin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VMIN2_I]] uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) { -// CHECK-LABEL: test_vmin_u32 return vmin_u32(a, b); -// CHECK: umin {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vmin_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> [[VMIN_I]], <2 x float> [[VMIN1_I]]) #4 +// CHECK: ret <2 x float> [[VMIN2_I]] float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vmin_f32 return vmin_f32(a, b); -// CHECK: fmin {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vminq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMIN_I]] int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vminq_s8 return vminq_s8(a, b); -// CHECK: smin {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <8 x i16> [[VMIN2_I]] int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vminq_s16 return vminq_s16(a, b); -// CHECK: smin {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vminq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i32> [[VMIN2_I]] int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vminq_s32 return vminq_s32(a, b); -// CHECK: smin {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMIN_I]] uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) { -// CHECK-LABEL: test_vminq_u8 return vminq_u8(a, b); -// CHECK: umin {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vminq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> 
[[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <8 x i16> [[VMIN2_I]] uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) { -// CHECK-LABEL: test_vminq_u16 return vminq_u16(a, b); -// CHECK: umin {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i32> [[VMIN2_I]] uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) { -// CHECK-LABEL: test_vminq_u32 return vminq_u32(a, b); -// CHECK: umin {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vminq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> [[VMIN_I]], <4 x float> [[VMIN1_I]]) #4 +// CHECK: ret <4 x float> [[VMIN2_I]] float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vminq_f32 return vminq_f32(a, b); -// CHECK: fmin {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vminq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> [[VMIN_I]], <2 x double> [[VMIN1_I]]) #4 +// CHECK: ret <2 x double> [[VMIN2_I]] float64x2_t test_vminq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vminq_f64 return vminq_f64(a, b); -// CHECK: fmin {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> [[VMAXNM_I]], <2 x float> [[VMAXNM1_I]]) #4 +// CHECK: ret <2 x float> [[VMAXNM2_I]] float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vmaxnm_f32 return vmaxnm_f32(a, b); -// CHECK: fmaxnm {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> [[VMAXNM_I]], <4 x float> [[VMAXNM1_I]]) #4 
+// CHECK: ret <4 x float> [[VMAXNM2_I]] float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vmaxnmq_f32 return vmaxnmq_f32(a, b); -// CHECK: fmaxnm {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vmaxnmq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> [[VMAXNM_I]], <2 x double> [[VMAXNM1_I]]) #4 +// CHECK: ret <2 x double> [[VMAXNM2_I]] float64x2_t test_vmaxnmq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vmaxnmq_f64 return vmaxnmq_f64(a, b); -// CHECK: fmaxnm {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vminnm_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMINNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> [[VMINNM_I]], <2 x float> [[VMINNM1_I]]) #4 +// CHECK: ret <2 x float> [[VMINNM2_I]] float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vminnm_f32 return vminnm_f32(a, b); -// CHECK: fminnm {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vminnmq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> [[VMINNM_I]], <4 x float> [[VMINNM1_I]]) #4 +// CHECK: ret <4 x float> [[VMINNM2_I]] float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vminnmq_f32 return vminnmq_f32(a, b); -// CHECK: fminnm {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vminnmq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> [[VMINNM_I]], <2 x double> [[VMINNM1_I]]) #4 +// CHECK: ret <2 x double> [[VMINNM2_I]] float64x2_t test_vminnmq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vminnmq_f64 return vminnmq_f64(a, b); -// CHECK: fminnm {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vpmax_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMAX_I]] int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) { -// CHECK-LABEL: test_vpmax_s8 return vpmax_s8(a, b); -// CHECK: smaxp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define 
<4 x i16> @test_vpmax_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMAX2_I]] int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vpmax_s16 return vpmax_s16(a, b); -// CHECK: smaxp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vpmax_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMAX2_I]] int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vpmax_s32 return vpmax_s32(a, b); -// CHECK: smaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vpmax_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMAX_I]] uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) { -// CHECK-LABEL: test_vpmax_u8 return vpmax_u8(a, b); -// CHECK: umaxp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vpmax_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMAX2_I]] uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) { -// CHECK-LABEL: test_vpmax_u16 return vpmax_u16(a, b); -// CHECK: umaxp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vpmax_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMAX2_I]] uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) { -// CHECK-LABEL: test_vpmax_u32 return vpmax_u32(a, b); -// CHECK: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vpmax_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> [[VPMAX_I]], <2 x float> 
[[VPMAX1_I]]) #4 +// CHECK: ret <2 x float> [[VPMAX2_I]] float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vpmax_f32 return vpmax_f32(a, b); -// CHECK: fmaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vpmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VPMAX_I]] int8x16_t test_vpmaxq_s8(int8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vpmaxq_s8 return vpmaxq_s8(a, b); -// CHECK: smaxp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vpmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> [[VPMAX_I]], <8 x i16> [[VPMAX1_I]]) #4 +// CHECK: ret <8 x i16> [[VPMAX2_I]] int16x8_t test_vpmaxq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vpmaxq_s16 return vpmaxq_s16(a, b); -// CHECK: smaxp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vpmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> [[VPMAX_I]], <4 x i32> [[VPMAX1_I]]) #4 +// CHECK: ret <4 x i32> [[VPMAX2_I]] int32x4_t test_vpmaxq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vpmaxq_s32 return vpmaxq_s32(a, b); -// CHECK: smaxp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vpmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VPMAX_I]] uint8x16_t test_vpmaxq_u8(uint8x16_t a, uint8x16_t b) { -// CHECK-LABEL: test_vpmaxq_u8 return vpmaxq_u8(a, b); -// CHECK: umaxp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vpmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> [[VPMAX_I]], <8 x i16> [[VPMAX1_I]]) #4 +// CHECK: ret <8 x i16> [[VPMAX2_I]] uint16x8_t test_vpmaxq_u16(uint16x8_t a, uint16x8_t b) { -// CHECK-LABEL: test_vpmaxq_u16 return vpmaxq_u16(a, b); -// CHECK: umaxp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vpmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> [[VPMAX_I]], <4 x 
i32> [[VPMAX1_I]]) #4 +// CHECK: ret <4 x i32> [[VPMAX2_I]] uint32x4_t test_vpmaxq_u32(uint32x4_t a, uint32x4_t b) { -// CHECK-LABEL: test_vpmaxq_u32 return vpmaxq_u32(a, b); -// CHECK: umaxp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vpmaxq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VPMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> [[VPMAX_I]], <4 x float> [[VPMAX1_I]]) #4 +// CHECK: ret <4 x float> [[VPMAX2_I]] float32x4_t test_vpmaxq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vpmaxq_f32 return vpmaxq_f32(a, b); -// CHECK: fmaxp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vpmaxq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VPMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> [[VPMAX_I]], <2 x double> [[VPMAX1_I]]) #4 +// CHECK: ret <2 x double> [[VPMAX2_I]] float64x2_t test_vpmaxq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vpmaxq_f64 return vpmaxq_f64(a, b); -// CHECK: fmaxp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vpmin_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMIN_I]] int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) { -// CHECK-LABEL: test_vpmin_s8 return vpmin_s8(a, b); -// CHECK: sminp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vpmin_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMIN2_I]] int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vpmin_s16 return vpmin_s16(a, b); -// CHECK: sminp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vpmin_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMIN2_I]] int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vpmin_s32 return vpmin_s32(a, b); -// CHECK: sminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vpmin_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMIN_I]] uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) { -// CHECK-LABEL: test_vpmin_u8 return vpmin_u8(a, b); -// CHECK: uminp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vpmin_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMIN2_I]] uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) { -// CHECK-LABEL: test_vpmin_u16 return vpmin_u16(a, b); -// CHECK: uminp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vpmin_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMIN2_I]] uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) { -// CHECK-LABEL: test_vpmin_u32 return vpmin_u32(a, b); -// CHECK: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vpmin_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> [[VPMIN_I]], <2 x float> [[VPMIN1_I]]) #4 +// CHECK: ret <2 x float> [[VPMIN2_I]] float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vpmin_f32 return vpmin_f32(a, b); -// CHECK: fminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vpminq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VPMIN_I]] int8x16_t test_vpminq_s8(int8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vpminq_s8 return vpminq_s8(a, b); -// CHECK: sminp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vpminq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> [[VPMIN_I]], <8 x i16> [[VPMIN1_I]]) #4 +// CHECK: ret <8 x i16> [[VPMIN2_I]] int16x8_t test_vpminq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vpminq_s16 return vpminq_s16(a, b); -// CHECK: sminp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vpminq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to 
<16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> [[VPMIN_I]], <4 x i32> [[VPMIN1_I]]) #4 +// CHECK: ret <4 x i32> [[VPMIN2_I]] int32x4_t test_vpminq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vpminq_s32 return vpminq_s32(a, b); -// CHECK: sminp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vpminq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VPMIN_I]] uint8x16_t test_vpminq_u8(uint8x16_t a, uint8x16_t b) { -// CHECK-LABEL: test_vpminq_u8 return vpminq_u8(a, b); -// CHECK: uminp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vpminq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> [[VPMIN_I]], <8 x i16> [[VPMIN1_I]]) #4 +// CHECK: ret <8 x i16> [[VPMIN2_I]] uint16x8_t test_vpminq_u16(uint16x8_t a, uint16x8_t b) { -// CHECK-LABEL: test_vpminq_u16 return vpminq_u16(a, b); -// CHECK: uminp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vpminq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> [[VPMIN_I]], <4 x i32> [[VPMIN1_I]]) #4 +// CHECK: ret <4 x i32> [[VPMIN2_I]] uint32x4_t test_vpminq_u32(uint32x4_t a, uint32x4_t b) { -// CHECK-LABEL: test_vpminq_u32 return vpminq_u32(a, b); -// CHECK: uminp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vpminq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VPMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> [[VPMIN_I]], <4 x float> [[VPMIN1_I]]) #4 +// CHECK: ret <4 x float> [[VPMIN2_I]] float32x4_t test_vpminq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vpminq_f32 return vpminq_f32(a, b); -// CHECK: fminp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vpminq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VPMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> [[VPMIN_I]], <2 x double> [[VPMIN1_I]]) #4 +// 
CHECK: ret <2 x double> [[VPMIN2_I]] float64x2_t test_vpminq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vpminq_f64 return vpminq_f64(a, b); -// CHECK: fminp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vpmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> [[VPMAXNM_I]], <2 x float> [[VPMAXNM1_I]]) #4 +// CHECK: ret <2 x float> [[VPMAXNM2_I]] float32x2_t test_vpmaxnm_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vpmaxnm_f32 return vpmaxnm_f32(a, b); -// CHECK: fmaxnmp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vpmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VPMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VPMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VPMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> [[VPMAXNM_I]], <4 x float> [[VPMAXNM1_I]]) #4 +// CHECK: ret <4 x float> [[VPMAXNM2_I]] float32x4_t test_vpmaxnmq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vpmaxnmq_f32 return vpmaxnmq_f32(a, b); -// CHECK: fmaxnmp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vpmaxnmq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VPMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VPMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VPMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> [[VPMAXNM_I]], <2 x double> [[VPMAXNM1_I]]) #4 +// CHECK: ret <2 x double> [[VPMAXNM2_I]] float64x2_t test_vpmaxnmq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vpmaxnmq_f64 return vpmaxnmq_f64(a, b); -// CHECK: fmaxnmp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vpminnm_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPMINNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> [[VPMINNM_I]], <2 x float> [[VPMINNM1_I]]) #4 +// CHECK: ret <2 x float> [[VPMINNM2_I]] float32x2_t test_vpminnm_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vpminnm_f32 return vpminnm_f32(a, b); -// CHECK: fminnmp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vpminnmq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VPMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VPMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: 
[[VPMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> [[VPMINNM_I]], <4 x float> [[VPMINNM1_I]]) #4 +// CHECK: ret <4 x float> [[VPMINNM2_I]] float32x4_t test_vpminnmq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vpminnmq_f32 return vpminnmq_f32(a, b); -// CHECK: fminnmp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vpminnmq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VPMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VPMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VPMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> [[VPMINNM_I]], <2 x double> [[VPMINNM1_I]]) #4 +// CHECK: ret <2 x double> [[VPMINNM2_I]] float64x2_t test_vpminnmq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vpminnmq_f64 return vpminnmq_f64(a, b); -// CHECK: fminnmp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vpadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPADD_V_I]] int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) { -// CHECK-LABEL: test_vpadd_s8 return vpadd_s8(a, b); -// CHECK: addp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i16> @test_vpadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vpadd_s16 return vpadd_s16(a, b); -// CHECK: addp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vpadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vpadd_s32 return vpadd_s32(a, b); -// CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i8> @test_vpadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPADD_V_I]] uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) { -// CHECK-LABEL: test_vpadd_u8 return vpadd_u8(a, b); -// CHECK: addp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// 
CHECK-LABEL: define <4 x i16> @test_vpadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) { -// CHECK-LABEL: test_vpadd_u16 return vpadd_u16(a, b); -// CHECK: addp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vpadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) { -// CHECK-LABEL: test_vpadd_u32 return vpadd_u32(a, b); -// CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vpadd_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> [[VPADD_V_I]], <2 x float> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vpadd_f32 return vpadd_f32(a, b); -// CHECK: faddp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <16 x i8> @test_vpaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VPADDQ_V_I]] int8x16_t test_vpaddq_s8(int8x16_t a, int8x16_t b) { -// CHECK-LABEL: test_vpaddq_s8 return vpaddq_s8(a, b); -// CHECK: addp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vpaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[VPADDQ_V_I]], <8 x i16> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] 
to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vpaddq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vpaddq_s16 return vpaddq_s16(a, b); -// CHECK: addp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vpaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[VPADDQ_V_I]], <4 x i32> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vpaddq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vpaddq_s32 return vpaddq_s32(a, b); -// CHECK: addp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <16 x i8> @test_vpaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VPADDQ_V_I]] uint8x16_t test_vpaddq_u8(uint8x16_t a, uint8x16_t b) { -// CHECK-LABEL: test_vpaddq_u8 return vpaddq_u8(a, b); -// CHECK: addp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i16> @test_vpaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[VPADDQ_V_I]], <8 x i16> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vpaddq_u16(uint16x8_t a, uint16x8_t b) { -// CHECK-LABEL: test_vpaddq_u16 return vpaddq_u16(a, b); -// CHECK: addp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vpaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[VPADDQ_V_I]], <4 x i32> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vpaddq_u32(uint32x4_t a, uint32x4_t b) { -// CHECK-LABEL: test_vpaddq_u32 return vpaddq_u32(a, b); -// CHECK: addp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vpaddq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// 
CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> [[VPADDQ_V_I]], <4 x float> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x float> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vpaddq_f32 return vpaddq_f32(a, b); -// CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vpaddq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> [[VPADDQ_V_I]], <2 x double> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x double> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x double> +// CHECK: ret <2 x double> [[TMP2]] float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vpaddq_f64 return vpaddq_f64(a, b); -// CHECK: faddp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vqdmulh_s16 return vqdmulh_s16(a, b); -// CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4 +// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vqdmulh_s32 return vqdmulh_s32(a, b); -// CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vqdmulhq_s16 return vqdmulhq_s16(a, b); -// CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4 +// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vqdmulhq_s32 return vqdmulhq_s32(a, b); -// CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) { -// CHECK-LABEL: test_vqrdmulh_s16 return vqrdmulh_s16(a, b); -// CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) { -// CHECK-LABEL: test_vqrdmulh_s32 return vqrdmulh_s32(a, b); -// CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> 
[[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) { -// CHECK-LABEL: test_vqrdmulhq_s16 return vqrdmulhq_s16(a, b); -// CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) { -// CHECK-LABEL: test_vqrdmulhq_s32 return vqrdmulhq_s32(a, b); -// CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x float> @test_vmulx_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #4 +// CHECK: ret <2 x float> [[VMULX2_I]] float32x2_t test_vmulx_f32(float32x2_t a, float32x2_t b) { -// CHECK-LABEL: test_vmulx_f32 return vmulx_f32(a, b); -// CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vmulxq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #4 +// CHECK: ret <4 x float> [[VMULX2_I]] float32x4_t test_vmulxq_f32(float32x4_t a, float32x4_t b) { -// CHECK-LABEL: test_vmulxq_f32 return vmulxq_f32(a, b); -// CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vmulxq_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #4 +// CHECK: ret <2 x double> [[VMULX2_I]] float64x2_t test_vmulxq_f64(float64x2_t a, float64x2_t b) { -// CHECK-LABEL: test_vmulxq_f64 return vmulxq_f64(a, b); -// CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vshl_n_s8(<8 x i8> %a) #0 { +// CHECK: 
[[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <8 x i8> [[VSHL_N]] int8x8_t test_vshl_n_s8(int8x8_t a) { -// CHECK-LABEL: test_vshl_n_s8 return vshl_n_s8(a, 3); -// CHECK: shl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <4 x i16> [[VSHL_N]] int16x4_t test_vshl_n_s16(int16x4_t a) { -// CHECK-LABEL: test_vshl_n_s16 return vshl_n_s16(a, 3); -// CHECK: shl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 3, i32 3> +// CHECK: ret <2 x i32> [[VSHL_N]] int32x2_t test_vshl_n_s32(int32x2_t a) { -// CHECK-LABEL: test_vshl_n_s32 return vshl_n_s32(a, 3); -// CHECK: shl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vshlq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <16 x i8> [[VSHL_N]] int8x16_t test_vshlq_n_s8(int8x16_t a) { -// CHECK-LABEL: test_vshlq_n_s8 return vshlq_n_s8(a, 3); -// CHECK: shl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <8 x i16> [[VSHL_N]] int16x8_t test_vshlq_n_s16(int16x8_t a) { -// CHECK-LABEL: test_vshlq_n_s16 return vshlq_n_s16(a, 3); -// CHECK: shl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <4 x i32> [[VSHL_N]] int32x4_t test_vshlq_n_s32(int32x4_t a) { -// CHECK-LABEL: test_vshlq_n_s32 return vshlq_n_s32(a, 3); -// CHECK: shl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 3, i64 3> +// CHECK: ret <2 x i64> [[VSHL_N]] int64x2_t test_vshlq_n_s64(int64x2_t a) { -// CHECK-LABEL: test_vshlq_n_s64 return vshlq_n_s64(a, 3); -// CHECK: shl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vshl_n_u8(<8 x i8> %a) #0 { +// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <8 x i8> [[VSHL_N]] int8x8_t test_vshl_n_u8(int8x8_t a) { -// CHECK-LABEL: test_vshl_n_u8 return vshl_n_u8(a, 3); -// CHECK: shl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <4 x i16> [[VSHL_N]] int16x4_t test_vshl_n_u16(int16x4_t a) { -// CHECK-LABEL: test_vshl_n_u16 return vshl_n_u16(a, 3); -// CHECK: shl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x
i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 3, i32 3> +// CHECK: ret <2 x i32> [[VSHL_N]] int32x2_t test_vshl_n_u32(int32x2_t a) { -// CHECK-LABEL: test_vshl_n_u32 return vshl_n_u32(a, 3); -// CHECK: shl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vshlq_n_u8(<16 x i8> %a) #0 { +// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <16 x i8> [[VSHL_N]] int8x16_t test_vshlq_n_u8(int8x16_t a) { -// CHECK-LABEL: test_vshlq_n_u8 return vshlq_n_u8(a, 3); -// CHECK: shl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <8 x i16> [[VSHL_N]] int16x8_t test_vshlq_n_u16(int16x8_t a) { -// CHECK-LABEL: test_vshlq_n_u16 return vshlq_n_u16(a, 3); -// CHECK: shl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <4 x i32> [[VSHL_N]] int32x4_t test_vshlq_n_u32(int32x4_t a) { -// CHECK-LABEL: test_vshlq_n_u32 return vshlq_n_u32(a, 3); -// CHECK: shl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 3, i64 3> +// CHECK: ret <2 x i64> [[VSHL_N]] int64x2_t test_vshlq_n_u64(int64x2_t a) { -// CHECK-LABEL: test_vshlq_n_u64 return vshlq_n_u64(a, 3); -// CHECK: shl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <8 x i8> [[VSHR_N]] int8x8_t test_vshr_n_s8(int8x8_t a) { - // CHECK-LABEL: test_vshr_n_s8 return vshr_n_s8(a, 3); - // CHECK: sshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <4 x i16> [[VSHR_N]] int16x4_t test_vshr_n_s16(int16x4_t a) { - // CHECK-LABEL: test_vshr_n_s16 return vshr_n_s16(a, 3); - // CHECK: sshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 3, i32 3> +// CHECK: ret <2 x i32> [[VSHR_N]] int32x2_t test_vshr_n_s32(int32x2_t a) { - // CHECK-LABEL: test_vshr_n_s32 return vshr_n_s32(a, 3); - // CHECK: sshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <16 x i8> [[VSHR_N]] int8x16_t test_vshrq_n_s8(int8x16_t a) { - // CHECK-LABEL: test_vshrq_n_s8 return vshrq_n_s8(a, 3); - // CHECK: sshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK:
[[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <8 x i16> [[VSHR_N]] int16x8_t test_vshrq_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vshrq_n_s16 return vshrq_n_s16(a, 3); - // CHECK: sshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <4 x i32> [[VSHR_N]] int32x4_t test_vshrq_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vshrq_n_s32 return vshrq_n_s32(a, 3); - // CHECK: sshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 3, i64 3> +// CHECK: ret <2 x i64> [[VSHR_N]] int64x2_t test_vshrq_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vshrq_n_s64 return vshrq_n_s64(a, 3); - // CHECK: sshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <8 x i8> [[VSHR_N]] int8x8_t test_vshr_n_u8(int8x8_t a) { - // CHECK-LABEL: test_vshr_n_u8 return vshr_n_u8(a, 3); - // CHECK: ushr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <4 x i16> [[VSHR_N]] int16x4_t test_vshr_n_u16(int16x4_t a) { - // CHECK-LABEL: test_vshr_n_u16 return vshr_n_u16(a, 3); - // CHECK: ushr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 3, i32 3> +// CHECK: ret <2 x i32> [[VSHR_N]] int32x2_t test_vshr_n_u32(int32x2_t a) { - // CHECK-LABEL: test_vshr_n_u32 return vshr_n_u32(a, 3); - // CHECK: ushr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: ret <16 x i8> [[VSHR_N]] int8x16_t test_vshrq_n_u8(int8x16_t a) { - // CHECK-LABEL: test_vshrq_n_u8 return vshrq_n_u8(a, 3); - // CHECK: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> +// CHECK: ret <8 x i16> [[VSHR_N]] int16x8_t test_vshrq_n_u16(int16x8_t a) { - // CHECK-LABEL: test_vshrq_n_u16 return vshrq_n_u16(a, 3); - // CHECK: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <4 x i32> [[VSHR_N]] int32x4_t test_vshrq_n_u32(int32x4_t a) { - // CHECK-LABEL: test_vshrq_n_u32 return vshrq_n_u32(a, 3); - // CHECK: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 }
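// As a cross-check on the ashr/lshr patterns in the vshr_n tests around this
// point, here is a minimal scalar sketch of the per-lane semantics they pin
// down. It is an editorial illustration under stated assumptions, not part of
// the checked test: the ref_* helper names are hypothetical, n is assumed to
// be a valid shift immediate, and right-shifting a negative int is assumed to
// be arithmetic (implementation-defined in C, but true for the targets here).
#include <stdint.h>

// vshr_n_s8(a, n): per-lane arithmetic shift right (IR ashr, asm sshr).
static inline int8_t ref_vshr_n_s8(int8_t a, int n) {
  return (int8_t)(a >> n); /* a promotes to int; shift assumed arithmetic */
}

// vshr_n_u8(a, n): per-lane logical shift right (IR lshr, asm ushr).
static inline uint8_t ref_vshr_n_u8(uint8_t a, int n) {
  return (uint8_t)(a >> n);
}

// vsra_n_s8(a, b, n), tested just below: shift right then accumulate, i.e.
// the "ashr %b, splat(3)" followed by "add %a, ..." pattern in the checks.
static inline int8_t ref_vsra_n_s8(int8_t a, int8_t b, int n) {
  return (int8_t)(a + (int8_t)(b >> n));
}

// vrshr_n_s8(a, n), further below: rounding shift right (add half, then
// shift); the IR expresses it as @llvm.aarch64.neon.srshl by a splat of -n.
static inline int8_t ref_vrshr_n_s8(int8_t a, int n) {
  return (int8_t)((a + (1 << (n - 1))) >> n);
}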
+// CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 3, i64 3> +// CHECK: ret <2 x i64> [[VSHR_N]] int64x2_t test_vshrq_n_u64(int64x2_t a) { - // CHECK-LABEL: test_vshrq_n_u64 return vshrq_n_u64(a, 3); - // CHECK: ushr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]] +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsra_n_s8 return vsra_n_s8(a, b, 3); - // CHECK: ssra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3> +// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i16> [[TMP4]] int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsra_n_s16 return vsra_n_s16(a, b, 3); - // CHECK: ssra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], <i32 3, i32 3> +// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i32> [[TMP4]] int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsra_n_s32 return vsra_n_s32(a, b, 3); - // CHECK: ssra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]] +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vsraq_n_s8 return vsraq_n_s8(a, b, 3); - // CHECK: ssra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> +// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <8 x i16> [[TMP4]] int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsraq_n_s16 return vsraq_n_s16(a, b, 3); - // CHECK: ssra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRA_N:%.*]] = ashr <4 x i32>
[[TMP3]], <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i32> [[TMP4]] int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsraq_n_s32 return vsraq_n_s32(a, b, 3); - // CHECK: ssra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], <i64 3, i64 3> +// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i64> [[TMP4]] int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsraq_n_s64 return vsraq_n_s64(a, b, 3); - // CHECK: ssra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]] +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vsra_n_u8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsra_n_u8 return vsra_n_u8(a, b, 3); - // CHECK: usra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3> +// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i16> [[TMP4]] int16x4_t test_vsra_n_u16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsra_n_u16 return vsra_n_u16(a, b, 3); - // CHECK: usra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], <i32 3, i32 3> +// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i32> [[TMP4]] int32x2_t test_vsra_n_u32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsra_n_u32 return vsra_n_u32(a, b, 3); - // CHECK: usra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]] +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vsraq_n_u8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vsraq_n_u8 return vsraq_n_u8(a, b, 3); - // CHECK: usra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> +// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <8 x i16> [[TMP4]] int16x8_t test_vsraq_n_u16(int16x8_t a, int16x8_t b) { - //
CHECK-LABEL: test_vsraq_n_u16 return vsraq_n_u16(a, b, 3); - // CHECK: usra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i32> [[TMP4]] int32x4_t test_vsraq_n_u32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsraq_n_u32 return vsraq_n_u32(a, b, 3); - // CHECK: usra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], <i64 3, i64 3> +// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i64> [[TMP4]] int64x2_t test_vsraq_n_u64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsraq_n_u64 return vsraq_n_u64(a, b, 3); - // CHECK: usra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>) +// CHECK: ret <8 x i8> [[VRSHR_N]] int8x8_t test_vrshr_n_s8(int8x8_t a) { - // CHECK-LABEL: test_vrshr_n_s8 return vrshr_n_s8(a, 3); - // CHECK: srshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>) +// CHECK: ret <4 x i16> [[VRSHR_N]]1 int16x4_t test_vrshr_n_s16(int16x4_t a) { - // CHECK-LABEL: test_vrshr_n_s16 return vrshr_n_s16(a, 3); - // CHECK: srshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>) +// CHECK: ret <2 x i32> [[VRSHR_N]]1 int32x2_t test_vrshr_n_s32(int32x2_t a) { - // CHECK-LABEL: test_vrshr_n_s32 return vrshr_n_s32(a, 3); - // CHECK: srshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>) +// CHECK: ret <16 x i8> [[VRSHR_N]] int8x16_t test_vrshrq_n_s8(int8x16_t a) { - // CHECK-LABEL: test_vrshrq_n_s8 return vrshrq_n_s8(a, 3); - // CHECK: srshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>) +// CHECK: ret <8 x i16> [[VRSHR_N]]1 int16x8_t test_vrshrq_n_s16(int16x8_t a) {
- // CHECK-LABEL: test_vrshrq_n_s16 return vrshrq_n_s16(a, 3); - // CHECK: srshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>) +// CHECK: ret <4 x i32> [[VRSHR_N]]1 int32x4_t test_vrshrq_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vrshrq_n_s32 return vrshrq_n_s32(a, 3); - // CHECK: srshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>) +// CHECK: ret <2 x i64> [[VRSHR_N]]1 int64x2_t test_vrshrq_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vrshrq_n_s64 return vrshrq_n_s64(a, 3); - // CHECK: srshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>) +// CHECK: ret <8 x i8> [[VRSHR_N]] int8x8_t test_vrshr_n_u8(int8x8_t a) { - // CHECK-LABEL: test_vrshr_n_u8 return vrshr_n_u8(a, 3); - // CHECK: urshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>) +// CHECK: ret <4 x i16> [[VRSHR_N]]1 int16x4_t test_vrshr_n_u16(int16x4_t a) { - // CHECK-LABEL: test_vrshr_n_u16 return vrshr_n_u16(a, 3); - // CHECK: urshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>) +// CHECK: ret <2 x i32> [[VRSHR_N]]1 int32x2_t test_vrshr_n_u32(int32x2_t a) { - // CHECK-LABEL: test_vrshr_n_u32 return vrshr_n_u32(a, 3); - // CHECK: urshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>) +// CHECK: ret <16 x i8> [[VRSHR_N]] int8x16_t test_vrshrq_n_u8(int8x16_t a) { - // CHECK-LABEL: test_vrshrq_n_u8 return vrshrq_n_u8(a, 3); - // CHECK: urshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>) +// CHECK: ret <8 x i16> [[VRSHR_N]]1 int16x8_t test_vrshrq_n_u16(int16x8_t a) { - // CHECK-LABEL: test_vrshrq_n_u16 return vrshrq_n_u16(a, 3); - // CHECK: urshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast
<16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>) +// CHECK: ret <4 x i32> [[VRSHR_N]]1 int32x4_t test_vrshrq_n_u32(int32x4_t a) { - // CHECK-LABEL: test_vrshrq_n_u32 return vrshrq_n_u32(a, 3); - // CHECK: urshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>) +// CHECK: ret <2 x i64> [[VRSHR_N]]1 int64x2_t test_vrshrq_n_u64(int64x2_t a) { - // CHECK-LABEL: test_vrshrq_n_u64 return vrshrq_n_u64(a, 3); - // CHECK: urshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>) +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]] +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vrsra_n_s8 return vrsra_n_s8(a, b, 3); - // CHECK: srsra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i16> [[TMP3]] int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vrsra_n_s16 return vrsra_n_s16(a, b, 3); - // CHECK: srsra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i32> [[TMP3]] int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vrsra_n_s32 return vrsra_n_s32(a, b, 3); - // CHECK: srsra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %b, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>) +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]] +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vrsraq_n_s8 return vrsraq_n_s8(a, b, 3); - // CHECK: srsra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x
i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <8 x i16> [[TMP3]] int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vrsraq_n_s16 return vrsraq_n_s16(a, b, 3); - // CHECK: srsra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i32> [[TMP3]] int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vrsraq_n_s32 return vrsraq_n_s32(a, b, 3); - // CHECK: srsra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i64> [[TMP3]] int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vrsraq_n_s64 return vrsraq_n_s64(a, b, 3); - // CHECK: srsra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>) +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]] +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vrsra_n_u8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vrsra_n_u8 return vrsra_n_u8(a, b, 3); - // CHECK: ursra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i16> [[TMP3]] int16x4_t test_vrsra_n_u16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vrsra_n_u16 return vrsra_n_u16(a, b, 3); - // CHECK: ursra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i32> [[TMP3]] int32x2_t test_vrsra_n_u32(int32x2_t a, int32x2_t b) { - //
CHECK-LABEL: test_vrsra_n_u32 return vrsra_n_u32(a, b, 3); - // CHECK: ursra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %b, <16 x i8> ) +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]] +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vrsraq_n_u8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vrsraq_n_u8 return vrsraq_n_u8(a, b, 3); - // CHECK: ursra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> ) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <8 x i16> [[TMP3]] int16x8_t test_vrsraq_n_u16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vrsraq_n_u16 return vrsraq_n_u16(a, b, 3); - // CHECK: ursra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> ) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i32> [[TMP3]] int32x4_t test_vrsraq_n_u32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vrsraq_n_u32 return vrsraq_n_u32(a, b, 3); - // CHECK: ursra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> ) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i64> [[TMP3]] int64x2_t test_vrsraq_n_u64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vrsraq_n_u64 return vrsraq_n_u64(a, b, 3); - // CHECK: ursra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) +// CHECK: ret <8 x i8> [[VSRI_N]] int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsri_n_s8 return vsri_n_s8(a, b, 3); - // CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> 
[[VSRI_N]], <4 x i16> [[VSRI_N]]1, i32 3) +// CHECK: ret <4 x i16> [[VSRI_N]]2 int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsri_n_s16 return vsri_n_s16(a, b, 3); - // CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N]]1, i32 3) +// CHECK: ret <2 x i32> [[VSRI_N]]2 int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsri_n_s32 return vsri_n_s32(a, b, 3); - // CHECK: sri {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) +// CHECK: ret <16 x i8> [[VSRI_N]] int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vsriq_n_s8 return vsriq_n_s8(a, b, 3); - // CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N]]1, i32 3) +// CHECK: ret <8 x i16> [[VSRI_N]]2 int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsriq_n_s16 return vsriq_n_s16(a, b, 3); - // CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N]]1, i32 3) +// CHECK: ret <4 x i32> [[VSRI_N]]2 int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsriq_n_s32 return vsriq_n_s32(a, b, 3); - // CHECK: sri {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N]]1, i32 3) +// CHECK: ret <2 x i64> [[VSRI_N]]2 int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsriq_n_s64 return vsriq_n_s64(a, b, 3); - // CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vsri_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) +// CHECK: ret <8 x i8> 
[[VSRI_N]] int8x8_t test_vsri_n_u8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsri_n_u8 return vsri_n_u8(a, b, 3); - // CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vsri_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N]]1, i32 3) +// CHECK: ret <4 x i16> [[VSRI_N]]2 int16x4_t test_vsri_n_u16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsri_n_u16 return vsri_n_u16(a, b, 3); - // CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vsri_n_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N]]1, i32 3) +// CHECK: ret <2 x i32> [[VSRI_N]]2 int32x2_t test_vsri_n_u32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsri_n_u32 return vsri_n_u32(a, b, 3); - // CHECK: sri {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vsriq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) +// CHECK: ret <16 x i8> [[VSRI_N]] int8x16_t test_vsriq_n_u8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vsriq_n_u8 return vsriq_n_u8(a, b, 3); - // CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N]]1, i32 3) +// CHECK: ret <8 x i16> [[VSRI_N]]2 int16x8_t test_vsriq_n_u16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsriq_n_u16 return vsriq_n_u16(a, b, 3); - // CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N]]1, i32 3) +// CHECK: ret <4 x i32> [[VSRI_N]]2 int32x4_t test_vsriq_n_u32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsriq_n_u32 return vsriq_n_u32(a, b, 3); - // CHECK: sri {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: 
[[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N]]1, i32 3) +// CHECK: ret <2 x i64> [[VSRI_N]]2 int64x2_t test_vsriq_n_u64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsriq_n_u64 return vsriq_n_u64(a, b, 3); - // CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) +// CHECK: ret <8 x i8> [[VSRI_N]] poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vsri_n_p8 return vsri_n_p8(a, b, 3); - // CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N]]1, i32 15) +// CHECK: ret <4 x i16> [[VSRI_N]]2 poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vsri_n_p16 return vsri_n_p16(a, b, 15); - // CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15 } +// CHECK-LABEL: define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) +// CHECK: ret <16 x i8> [[VSRI_N]] poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vsriq_n_p8 return vsriq_n_p8(a, b, 3); - // CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N]]1, i32 15) +// CHECK: ret <8 x i16> [[VSRI_N]]2 poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vsriq_n_p16 return vsriq_n_p16(a, b, 15); - // CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15 } +// CHECK-LABEL: define <8 x i8> @test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) +// CHECK: ret <8 x i8> [[VSLI_N]] int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsli_n_s8 return vsli_n_s8(a, b, 3); - // CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N]]1, i32 3) +// CHECK: ret <4 x i16> [[VSLI_N]]2 int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsli_n_s16 return vsli_n_s16(a, b, 3); - // CHECK: sli 
{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N]]1, i32 3) +// CHECK: ret <2 x i32> [[VSLI_N]]2 int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsli_n_s32 return vsli_n_s32(a, b, 3); - // CHECK: sli {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) +// CHECK: ret <16 x i8> [[VSLI_N]] int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vsliq_n_s8 return vsliq_n_s8(a, b, 3); - // CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N]]1, i32 3) +// CHECK: ret <8 x i16> [[VSLI_N]]2 int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsliq_n_s16 return vsliq_n_s16(a, b, 3); - // CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N]]1, i32 3) +// CHECK: ret <4 x i32> [[VSLI_N]]2 int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsliq_n_s32 return vsliq_n_s32(a, b, 3); - // CHECK: sli {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N]]1, i32 3) +// CHECK: ret <2 x i64> [[VSLI_N]]2 int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsliq_n_s64 return vsliq_n_s64(a, b, 3); - // CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vsli_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) +// CHECK: ret <8 x i8> [[VSLI_N]] uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vsli_n_u8 return vsli_n_u8(a, b, 3); - // CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> 
@test_vsli_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N]]1, i32 3) +// CHECK: ret <4 x i16> [[VSLI_N]]2 uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vsli_n_u16 return vsli_n_u16(a, b, 3); - // CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N]]1, i32 3) +// CHECK: ret <2 x i32> [[VSLI_N]]2 uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vsli_n_u32 return vsli_n_u32(a, b, 3); - // CHECK: sli {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vsliq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) +// CHECK: ret <16 x i8> [[VSLI_N]] uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vsliq_n_u8 return vsliq_n_u8(a, b, 3); - // CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N]]1, i32 3) +// CHECK: ret <8 x i16> [[VSLI_N]]2 uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vsliq_n_u16 return vsliq_n_u16(a, b, 3); - // CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N]]1, i32 3) +// CHECK: ret <4 x i32> [[VSLI_N]]2 uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vsliq_n_u32 return vsliq_n_u32(a, b, 3); - // CHECK: sli {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N]]1, i32 3) +// CHECK: ret 
<2 x i64> [[VSLI_N]]2 uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vsliq_n_u64 return vsliq_n_u64(a, b, 3); - // CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) +// CHECK: ret <8 x i8> [[VSLI_N]] poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vsli_n_p8 return vsli_n_p8(a, b, 3); - // CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N]]1, i32 15) +// CHECK: ret <4 x i16> [[VSLI_N]]2 poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vsli_n_p16 return vsli_n_p16(a, b, 15); - // CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15 } +// CHECK-LABEL: define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) +// CHECK: ret <16 x i8> [[VSLI_N]] poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vsliq_n_p8 return vsliq_n_p8(a, b, 3); - // CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N]]1, i32 15) +// CHECK: ret <8 x i16> [[VSLI_N]]2 poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vsliq_n_p16 return vsliq_n_p16(a, b, 15); - // CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15 } +// CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>) +// CHECK: ret <8 x i8> [[VQSHLU_N]] int8x8_t test_vqshlu_n_s8(int8x8_t a) { - // CHECK-LABEL: test_vqshlu_n_s8 return vqshlu_n_s8(a, 3); - // CHECK: sqshlu {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 3, i16 3, i16 3, i16 3>) +// CHECK: ret <4 x i16> [[VQSHLU_N]]1 int16x4_t test_vqshlu_n_s16(int16x4_t a) { - // CHECK-LABEL: test_vqshlu_n_s16 return vqshlu_n_s16(a, 3); - // CHECK: sqshlu {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 } +// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 3, i32 3>) +// CHECK: ret <2 x i32> [[VQSHLU_N]]1 int32x2_t test_vqshlu_n_s32(int32x2_t a) { - // CHECK-LABEL: test_vqshlu_n_s32 return vqshlu_n_s32(a, 3); - // CHECK: sqshlu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 } +// CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>) +// CHECK: ret <16 x i8> [[VQSHLU_N]] int8x16_t test_vqshluq_n_s8(int8x16_t a) { - // CHECK-LABEL: test_vqshluq_n_s8 return vqshluq_n_s8(a, 3); - // CHECK: sqshlu {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>) +// CHECK: ret <8 x i16> [[VQSHLU_N]]1 int16x8_t test_vqshluq_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vqshluq_n_s16 return vqshluq_n_s16(a, 3); - // CHECK: sqshlu {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>) +// CHECK: ret <4 x i32> [[VQSHLU_N]]1 int32x4_t test_vqshluq_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vqshluq_n_s32 return vqshluq_n_s32(a, 3); - // CHECK: sqshlu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 } +// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 3, i64 3>) +// CHECK: ret <2 x i64> [[VQSHLU_N]]1 int64x2_t test_vqshluq_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vqshluq_n_s64 return vqshluq_n_s64(a, 3); - // CHECK: sqshlu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 } +// CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> +// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> +// CHECK: ret <8 x i8> [[VSHRN_N]] int8x8_t test_vshrn_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vshrn_n_s16 return vshrn_n_s16(a, 3); - // CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9> +// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VSHRN_N]] int16x4_t test_vshrn_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vshrn_n_s32 return vshrn_n_s32(a, 9); - // CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 19, i64 19> +// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VSHRN_N]] int32x2_t test_vshrn_n_s64(int64x2_t a) { - // CHECK-LABEL:
test_vshrn_n_s64 return vshrn_n_s64(a, 19); - // CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> +// CHECK: ret <8 x i8> [[VSHRN_N]] uint8x8_t test_vshrn_n_u16(uint16x8_t a) { - // CHECK-LABEL: test_vshrn_n_u16 return vshrn_n_u16(a, 3); - // CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VSHRN_N]] uint16x4_t test_vshrn_n_u32(uint32x4_t a) { - // CHECK-LABEL: test_vshrn_n_u32 return vshrn_n_u32(a, 9); - // CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VSHRN_N]] uint32x2_t test_vshrn_n_u64(uint64x2_t a) { - // CHECK-LABEL: test_vshrn_n_u64 return vshrn_n_u64(a, 19); - // CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vshrn_high_n_s16(int8x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vshrn_high_n_s16 return vshrn_high_n_s16(a, b, 3); - // CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vshrn_high_n_s32(int16x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vshrn_high_n_s32 return vshrn_high_n_s32(a, b, 9); - // CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vshrn_high_n_s64(int32x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vshrn_high_n_s64 return vshrn_high_n_s64(a, b, 19); - // CHECK: shrn2 
{{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vshrn_high_n_u16(uint8x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vshrn_high_n_u16 return vshrn_high_n_u16(a, b, 3); - // CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vshrn_high_n_u32(uint16x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vshrn_high_n_u32 return vshrn_high_n_u32(a, b, 9); - // CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], +// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vshrn_high_n_u64(uint32x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vshrn_high_n_u64 return vshrn_high_n_u64(a, b, 19); - // CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3) +// CHECK: ret <8 x i8> [[VQSHRUN_N]]1 int8x8_t test_vqshrun_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vqshrun_n_s16 return vqshrun_n_s16(a, 3); - // CHECK: sqshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9) +// CHECK: ret <4 x i16> [[VQSHRUN_N]]1 int16x4_t test_vqshrun_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vqshrun_n_s32 return vqshrun_n_s32(a, 9); - // CHECK: sqshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19) +// CHECK: ret <2 x i32> [[VQSHRUN_N]]1 int32x2_t test_vqshrun_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vqshrun_n_s64 return vqshrun_n_s64(a, 19); - // CHECK: sqshrun {{v[0-9]+}}.2s, 
{{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRUN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vqshrun_high_n_s16(int8x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vqshrun_high_n_s16 return vqshrun_high_n_s16(a, b, 3); - // CHECK: sqshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRUN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vqshrun_high_n_s32(int16x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vqshrun_high_n_s32 return vqshrun_high_n_s32(a, b, 9); - // CHECK: sqshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRUN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vqshrun_high_n_s64(int32x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vqshrun_high_n_s64 return vqshrun_high_n_s64(a, b, 19); - // CHECK: sqshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3) +// CHECK: ret <8 x i8> [[VRSHRN_N]]1 int8x8_t test_vrshrn_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vrshrn_n_s16 return vrshrn_n_s16(a, 3); - // CHECK: rshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9) +// CHECK: ret <4 x i16> [[VRSHRN_N]]1 int16x4_t test_vrshrn_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vrshrn_n_s32 return vrshrn_n_s32(a, 9); - // CHECK: rshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19) +// CHECK: ret <2 x i32> [[VRSHRN_N]]1 int32x2_t test_vrshrn_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vrshrn_n_s64 return vrshrn_n_s64(a, 19); - // CHECK: rshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// 
CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3) +// CHECK: ret <8 x i8> [[VRSHRN_N]]1 uint8x8_t test_vrshrn_n_u16(uint16x8_t a) { - // CHECK-LABEL: test_vrshrn_n_u16 return vrshrn_n_u16(a, 3); - // CHECK: rshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9) +// CHECK: ret <4 x i16> [[VRSHRN_N]]1 uint16x4_t test_vrshrn_n_u32(uint32x4_t a) { - // CHECK-LABEL: test_vrshrn_n_u32 return vrshrn_n_u32(a, 9); - // CHECK: rshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19) +// CHECK: ret <2 x i32> [[VRSHRN_N]]1 uint32x2_t test_vrshrn_n_u64(uint64x2_t a) { - // CHECK-LABEL: test_vrshrn_n_u64 return vrshrn_n_u64(a, 19); - // CHECK: rshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vrshrn_high_n_s16(int8x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vrshrn_high_n_s16 return vrshrn_high_n_s16(a, b, 3); - // CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vrshrn_high_n_s32(int16x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vrshrn_high_n_s32 return vrshrn_high_n_s32(a, b, 9); - // CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vrshrn_high_n_s64(int32x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vrshrn_high_n_s64 return vrshrn_high_n_s64(a, b, 19); - // CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vrshrn_high_n_u16(<8 
x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vrshrn_high_n_u16 return vrshrn_high_n_u16(a, b, 3); - // CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vrshrn_high_n_u32 return vrshrn_high_n_u32(a, b, 9); - // CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vrshrn_high_n_u64 return vrshrn_high_n_u64(a, b, 19); - // CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3) +// CHECK: ret <8 x i8> [[VQRSHRUN_N]]1 int8x8_t test_vqrshrun_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vqrshrun_n_s16 return vqrshrun_n_s16(a, 3); - // CHECK: sqrshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9) +// CHECK: ret <4 x i16> [[VQRSHRUN_N]]1 int16x4_t test_vqrshrun_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vqrshrun_n_s32 return vqrshrun_n_s32(a, 9); - // CHECK: sqrshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19) +// CHECK: ret <2 x i32> [[VQRSHRUN_N]]1 int32x2_t test_vqrshrun_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vqrshrun_n_s64 return vqrshrun_n_s64(a, 19); - // CHECK: sqrshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> 
@test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRUN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vqrshrun_high_n_s16(int8x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vqrshrun_high_n_s16 return vqrshrun_high_n_s16(a, b, 3); - // CHECK: sqrshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRUN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vqrshrun_high_n_s32(int16x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vqrshrun_high_n_s32 return vqrshrun_high_n_s32(a, b, 9); - // CHECK: sqrshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRUN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vqrshrun_high_n_s64(int32x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vqrshrun_high_n_s64 return vqrshrun_high_n_s64(a, b, 19); - // CHECK: sqrshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3) +// CHECK: ret <8 x i8> [[VQSHRN_N]]1 int8x8_t test_vqshrn_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vqshrn_n_s16 return vqshrn_n_s16(a, 3); - // CHECK: sqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) +// CHECK: ret <4 x i16> [[VQSHRN_N]]1 int16x4_t test_vqshrn_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vqshrn_n_s32 return vqshrn_n_s32(a, 9); - // CHECK: sqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19) +// CHECK: ret <2 x i32> [[VQSHRN_N]]1 int32x2_t test_vqshrn_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vqshrn_n_s64 return vqshrn_n_s64(a, 19); - // CHECK: sqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define 
<8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3) +// CHECK: ret <8 x i8> [[VQSHRN_N]]1 uint8x8_t test_vqshrn_n_u16(uint16x8_t a) { - // CHECK-LABEL: test_vqshrn_n_u16 return vqshrn_n_u16(a, 3); - // CHECK: uqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) +// CHECK: ret <4 x i16> [[VQSHRN_N]]1 uint16x4_t test_vqshrn_n_u32(uint32x4_t a) { - // CHECK-LABEL: test_vqshrn_n_u32 return vqshrn_n_u32(a, 9); - // CHECK: uqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19) +// CHECK: ret <2 x i32> [[VQSHRN_N]]1 uint32x2_t test_vqshrn_n_u64(uint64x2_t a) { - // CHECK-LABEL: test_vqshrn_n_u64 return vqshrn_n_u64(a, 19); - // CHECK: uqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vqshrn_high_n_s16(int8x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vqshrn_high_n_s16 return vqshrn_high_n_s16(a, b, 3); - // CHECK: sqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vqshrn_high_n_s32(int16x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vqshrn_high_n_s32 return vqshrn_high_n_s32(a, b, 9); - // CHECK: sqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vqshrn_high_n_s64(int32x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vqshrn_high_n_s64 return vqshrn_high_n_s64(a, b, 19); - // CHECK: sqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> 
%a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vqshrn_high_n_u16(uint8x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vqshrn_high_n_u16 return vqshrn_high_n_u16(a, b, 3); - // CHECK: uqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vqshrn_high_n_u32(uint16x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vqshrn_high_n_u32 return vqshrn_high_n_u32(a, b, 9); - // CHECK: uqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vqshrn_high_n_u64(uint32x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vqshrn_high_n_u64 return vqshrn_high_n_u64(a, b, 19); - // CHECK: uqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3) +// CHECK: ret <8 x i8> [[VQRSHRN_N]]1 int8x8_t test_vqrshrn_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vqrshrn_n_s16 return vqrshrn_n_s16(a, 3); - // CHECK: sqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9) +// CHECK: ret <4 x i16> [[VQRSHRN_N]]1 int16x4_t test_vqrshrn_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vqrshrn_n_s32 return vqrshrn_n_s32(a, 9); - // CHECK: sqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19) +// CHECK: ret <2 x i32> [[VQRSHRN_N]]1 int32x2_t test_vqrshrn_n_s64(int64x2_t a) { - // CHECK-LABEL: test_vqrshrn_n_s64 return vqrshrn_n_s64(a, 19); - // CHECK: sqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) #0 { +// 
CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3) +// CHECK: ret <8 x i8> [[VQRSHRN_N]]1 uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) { - // CHECK-LABEL: test_vqrshrn_n_u16 return vqrshrn_n_u16(a, 3); - // CHECK: uqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9) +// CHECK: ret <4 x i16> [[VQRSHRN_N]]1 uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) { - // CHECK-LABEL: test_vqrshrn_n_u32 return vqrshrn_n_u32(a, 9); - // CHECK: uqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19) +// CHECK: ret <2 x i32> [[VQRSHRN_N]]1 uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) { - // CHECK-LABEL: test_vqrshrn_n_u64 return vqrshrn_n_u64(a, 19); - // CHECK: uqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vqrshrn_high_n_s16(int8x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vqrshrn_high_n_s16 return vqrshrn_high_n_s16(a, b, 3); - // CHECK: sqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vqrshrn_high_n_s32(int16x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vqrshrn_high_n_s32 return vqrshrn_high_n_s32(a, b, 9); - // CHECK: sqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vqrshrn_high_n_s64(int32x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vqrshrn_high_n_s64 return vqrshrn_high_n_s64(a, b, 19); - // CHECK: sqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <16 x i8> 
@test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N]]1, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vqrshrn_high_n_u16 return vqrshrn_high_n_u16(a, b, 3); - // CHECK: uqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 } +// CHECK-LABEL: define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N]]1, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vqrshrn_high_n_u32 return vqrshrn_high_n_u32(a, b, 9); - // CHECK: uqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 } +// CHECK-LABEL: define <4 x i32> @test_vqrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19) +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N]]1, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vqrshrn_high_n_u64 return vqrshrn_high_n_u64(a, b, 19); - // CHECK: uqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 } +// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> [[VSHLL_N]] int16x8_t test_vshll_n_s8(int8x8_t a) { -// CHECK-LABEL: test_vshll_n_s8 return vshll_n_s8(a, 3); -// CHECK: sshll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], +// CHECK: ret <4 x i32> [[VSHLL_N]] int32x4_t test_vshll_n_s16(int16x4_t a) { -// CHECK-LABEL: test_vshll_n_s16 return vshll_n_s16(a, 9); -// CHECK: sshll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #9 } +// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], +// CHECK: ret <2 x i64> [[VSHLL_N]] int64x2_t test_vshll_n_s32(int32x2_t a) { -// CHECK-LABEL: test_vshll_n_s32 return vshll_n_s32(a, 19); -// CHECK: sshll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #19 } +// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> 
[[VSHLL_N]] uint16x8_t test_vshll_n_u8(uint8x8_t a) { -// CHECK-LABEL: test_vshll_n_u8 return vshll_n_u8(a, 3); -// CHECK: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], +// CHECK: ret <4 x i32> [[VSHLL_N]] uint32x4_t test_vshll_n_u16(uint16x4_t a) { -// CHECK-LABEL: test_vshll_n_u16 return vshll_n_u16(a, 9); -// CHECK: ushll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #9 } +// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], +// CHECK: ret <2 x i64> [[VSHLL_N]] uint64x2_t test_vshll_n_u32(uint32x2_t a) { -// CHECK-LABEL: test_vshll_n_u32 return vshll_n_u32(a, 19); -// CHECK: ushll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #19 } +// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> [[VSHLL_N]] int16x8_t test_vshll_high_n_s8(int8x16_t a) { -// CHECK-LABEL: test_vshll_high_n_s8 return vshll_high_n_s8(a, 3); -// CHECK: sshll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], +// CHECK: ret <4 x i32> [[VSHLL_N]] int32x4_t test_vshll_high_n_s16(int16x8_t a) { -// CHECK-LABEL: test_vshll_high_n_s16 return vshll_high_n_s16(a, 9); -// CHECK: sshll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #9 } +// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], +// CHECK: ret <2 x i64> [[VSHLL_N]] int64x2_t test_vshll_high_n_s32(int32x4_t a) { -// CHECK-LABEL: test_vshll_high_n_s32 return vshll_high_n_s32(a, 19); -// CHECK: sshll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #19 } +// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> [[VSHLL_N]] uint16x8_t test_vshll_high_n_u8(uint8x16_t a) { -// CHECK-LABEL: test_vshll_high_n_u8 return vshll_high_n_u8(a, 3); -// CHECK: ushll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #3 } +// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// 
CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK: ret <4 x i32> [[VSHLL_N]]
 uint32x4_t test_vshll_high_n_u16(uint16x8_t a) {
-// CHECK-LABEL: test_vshll_high_n_u16
   return vshll_high_n_u16(a, 9);
-// CHECK: ushll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #9
 }
+// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 {
+// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 19, i64 19>
+// CHECK: ret <2 x i64> [[VSHLL_N]]
 uint64x2_t test_vshll_high_n_u32(uint32x4_t a) {
-// CHECK-LABEL: test_vshll_high_n_u32
   return vshll_high_n_u32(a, 19);
-// CHECK: ushll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #19
 }
+// CHECK-LABEL: define <8 x i16> @test_vmovl_s8(<8 x i8> %a) #0 {
+// CHECK: [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK: ret <8 x i16> [[VMOVL_I]]
 int16x8_t test_vmovl_s8(int8x8_t a) {
-// CHECK-LABEL: test_vmovl_s8
   return vmovl_s8(a);
-// CHECK: sshll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0
 }
+// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: ret <4 x i32> [[VMOVL_I]]
 int32x4_t test_vmovl_s16(int16x4_t a) {
-// CHECK-LABEL: test_vmovl_s16
   return vmovl_s16(a);
-// CHECK: sshll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #0
 }
+// CHECK-LABEL: define <2 x i64> @test_vmovl_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: ret <2 x i64> [[VMOVL_I]]
 int64x2_t test_vmovl_s32(int32x2_t a) {
-// CHECK-LABEL: test_vmovl_s32
   return vmovl_s32(a);
-// CHECK: sshll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #0
 }
+// CHECK-LABEL: define <8 x i16> @test_vmovl_u8(<8 x i8> %a) #0 {
+// CHECK: [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK: ret <8 x i16> [[VMOVL_I]]
 uint16x8_t test_vmovl_u8(uint8x8_t a) {
-// CHECK-LABEL: test_vmovl_u8
   return vmovl_u8(a);
-// CHECK: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0
 }
+// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: ret <4 x i32> [[VMOVL_I]]
 uint32x4_t test_vmovl_u16(uint16x4_t a) {
-// CHECK-LABEL: test_vmovl_u16
   return vmovl_u16(a);
-// CHECK: ushll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #0
 }
+// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: ret <2 x i64> [[VMOVL_I]]
 uint64x2_t test_vmovl_u32(uint32x2_t a) {
-// CHECK-LABEL: test_vmovl_u32
   return vmovl_u32(a);
-// CHECK: ushll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #0
 }
+// CHECK-LABEL: define <8 x i16> @test_vmovl_high_s8(<16 x i8> %a) #0 {
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP0]]
 int16x8_t test_vmovl_high_s8(int8x16_t a) {
-// CHECK-LABEL: test_vmovl_high_s8
   return vmovl_high_s8(a);
-// CHECK: sshll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #0
 }
+// CHECK-LABEL: define <4 x i32> @test_vmovl_high_s16(<8 x i16> %a) #0 {
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vmovl_high_s16(int16x8_t a) {
-// CHECK-LABEL: test_vmovl_high_s16
   return vmovl_high_s16(a);
-// CHECK: sshll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #0
 }
+// CHECK-LABEL: define <2 x i64> @test_vmovl_high_s32(<4 x i32> %a) #0 {
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
 int64x2_t test_vmovl_high_s32(int32x4_t a) {
-// CHECK-LABEL: test_vmovl_high_s32
   return vmovl_high_s32(a);
-// CHECK: sshll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #0
 }
+// CHECK-LABEL: define <8 x i16> @test_vmovl_high_u8(<16 x i8> %a) #0 {
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP0]]
 uint16x8_t test_vmovl_high_u8(uint8x16_t a) {
-// CHECK-LABEL: test_vmovl_high_u8
   return vmovl_high_u8(a);
-// CHECK: ushll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #0
 }
+// CHECK-LABEL: define <4 x i32> @test_vmovl_high_u16(<8 x i16> %a) #0 {
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 uint32x4_t test_vmovl_high_u16(uint16x8_t a) {
-// CHECK-LABEL: test_vmovl_high_u16
   return vmovl_high_u16(a);
-// CHECK: ushll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #0
 }
+// CHECK-LABEL: define <2 x i64> @test_vmovl_high_u32(<4 x i32> %a) #0 {
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
 uint64x2_t test_vmovl_high_u32(uint32x4_t a) {
-// CHECK-LABEL: test_vmovl_high_u32
   return vmovl_high_u32(a);
-// CHECK: ushll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #0
 }
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31)
+// CHECK: ret <2 x float> [[VCVT_N]]1
 float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
- // CHECK-LABEL: test_vcvt_n_f32_s32
   return vcvt_n_f32_s32(a, 31);
- // CHECK: scvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
 }
+// CHECK-LABEL: define <4 x
float> @test_vcvtq_n_f32_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31) +// CHECK: ret <4 x float> [[VCVT_N]]1 float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) { - // CHECK-LABEL: test_vcvtq_n_f32_s32 return vcvtq_n_f32_s32(a, 31); - // CHECK: scvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 } +// CHECK-LABEL: define <2 x double> @test_vcvtq_n_f64_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50) +// CHECK: ret <2 x double> [[VCVT_N]]1 float64x2_t test_vcvtq_n_f64_s64(int64x2_t a) { - // CHECK-LABEL: test_vcvtq_n_f64_s64 return vcvtq_n_f64_s64(a, 50); - // CHECK: scvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 } +// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31) +// CHECK: ret <2 x float> [[VCVT_N]]1 float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) { - // CHECK-LABEL: test_vcvt_n_f32_u32 return vcvt_n_f32_u32(a, 31); - // CHECK: ucvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 } +// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31) +// CHECK: ret <4 x float> [[VCVT_N]]1 float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) { - // CHECK-LABEL: test_vcvtq_n_f32_u32 return vcvtq_n_f32_u32(a, 31); - // CHECK: ucvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 } +// CHECK-LABEL: define <2 x double> @test_vcvtq_n_f64_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50) +// CHECK: ret <2 x double> [[VCVT_N]]1 float64x2_t test_vcvtq_n_f64_u64(uint64x2_t a) { - // CHECK-LABEL: test_vcvtq_n_f64_u64 return vcvtq_n_f64_u64(a, 50); - // CHECK: ucvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 } +// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31) +// CHECK: ret <2 x i32> [[VCVT_N]]1 int32x2_t test_vcvt_n_s32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvt_n_s32_f32 return vcvt_n_s32_f32(a, 31); - // CHECK: fcvtzs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 } +// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31) +// CHECK: ret <4 x i32> [[VCVT_N]]1 
int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtq_n_s32_f32 return vcvtq_n_s32_f32(a, 31); - // CHECK: fcvtzs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 } +// CHECK-LABEL: define <2 x i64> @test_vcvtq_n_s64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50) +// CHECK: ret <2 x i64> [[VCVT_N]]1 int64x2_t test_vcvtq_n_s64_f64(float64x2_t a) { - // CHECK-LABEL: test_vcvtq_n_s64_f64 return vcvtq_n_s64_f64(a, 50); - // CHECK: fcvtzs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 } +// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31) +// CHECK: ret <2 x i32> [[VCVT_N]]1 uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvt_n_u32_f32 return vcvt_n_u32_f32(a, 31); - // CHECK: fcvtzu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 } +// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31) +// CHECK: ret <4 x i32> [[VCVT_N]]1 uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtq_n_u32_f32 return vcvtq_n_u32_f32(a, 31); - // CHECK: fcvtzu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 } +// CHECK-LABEL: define <2 x i64> @test_vcvtq_n_u64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50) +// CHECK: ret <2 x i64> [[VCVT_N]]1 uint64x2_t test_vcvtq_n_u64_f64(float64x2_t a) { - // CHECK-LABEL: test_vcvtq_n_u64_f64 return vcvtq_n_u64_f64(a, 50); - // CHECK: fcvtzu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 } +// CHECK-LABEL: define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vaddl_s8 return vaddl_s8(a, b); - // CHECK: saddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vaddl_s16 return vaddl_s16(a, b); - // CHECK: saddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, 
{{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vaddl_s32 return vaddl_s32(a, b); - // CHECK: saddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vaddl_u8 return vaddl_u8(a, b); - // CHECK: uaddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vaddl_u16 return vaddl_u16(a, b); - // CHECK: uaddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vaddl_u32 return vaddl_u32(a, b); - // CHECK: uaddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vaddl_high_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddl_high_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vaddl_high_s8 return vaddl_high_s8(a, b); - // CHECK: saddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vaddl_high_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: 
[[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK: [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32>
+// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP5]]
+// CHECK: ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddl_high_s16(int16x8_t a, int16x8_t b) {
- // CHECK-LABEL: test_vaddl_high_s16
   return vaddl_high_s16(a, b);
- // CHECK: saddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vaddl_high_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK: [[TMP5:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64>
+// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP5]]
+// CHECK: ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddl_high_s32(int32x4_t a, int32x4_t b) {
- // CHECK-LABEL: test_vaddl_high_s32
   return vaddl_high_s32(a, b);
- // CHECK: saddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i16> @test_vaddl_high_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK: ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddl_high_u8(uint8x16_t a, uint8x16_t b) {
- // CHECK-LABEL: test_vaddl_high_u8
   return vaddl_high_u8(a, b);
- // CHECK: uaddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vaddl_high_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK: [[TMP5:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
+// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP5]]
+// CHECK: ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddl_high_u16(uint16x8_t a, uint16x8_t b) {
- // CHECK-LABEL: test_vaddl_high_u16
   return vaddl_high_u16(a, b);
- // CHECK: uaddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vaddl_high_u32(<4 x i32> %a, <4 x i32>
%b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP5]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddl_high_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vaddl_high_u32 return vaddl_high_u32(a, b); - // CHECK: uaddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vaddw_s8 return vaddw_s8(a, b); - // CHECK: saddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vaddw_s16 return vaddw_s16(a, b); - // CHECK: saddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vaddw_s32 return vaddw_s32(a, b); - // CHECK: saddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vaddw_u8 return vaddw_u8(a, b); - // CHECK: uaddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vaddw_u16 return vaddw_u16(a, b); - // CHECK: uaddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// 
CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vaddw_u32 return vaddw_u32(a, b); - // CHECK: uaddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vaddw_high_s8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddw_high_s8(int16x8_t a, int8x16_t b) { - // CHECK-LABEL: test_vaddw_high_s8 return vaddw_high_s8(a, b); - // CHECK: saddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vaddw_high_s16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddw_high_s16(int32x4_t a, int16x8_t b) { - // CHECK-LABEL: test_vaddw_high_s16 return vaddw_high_s16(a, b); - // CHECK: saddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vaddw_high_s32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP2]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddw_high_s32(int64x2_t a, int32x4_t b) { - // CHECK-LABEL: test_vaddw_high_s32 return vaddw_high_s32(a, b); - // CHECK: saddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vaddw_high_u8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vaddw_high_u8(uint16x8_t a, uint8x16_t b) { - // CHECK-LABEL: test_vaddw_high_u8 return vaddw_high_u8(a, b); - // CHECK: uaddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vaddw_high_u16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddw_high_u16(uint32x4_t a, uint16x8_t b) { - // CHECK-LABEL: test_vaddw_high_u16 return vaddw_high_u16(a, b); - // CHECK: uaddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vaddw_high_u32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// 
CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP2]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddw_high_u32(uint64x2_t a, uint32x4_t b) { - // CHECK-LABEL: test_vaddw_high_u32 return vaddw_high_u32(a, b); - // CHECK: uaddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16> +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsubl_s8 return vsubl_s8(a, b); - // CHECK: ssubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsubl_s16 return vsubl_s16(a, b); - // CHECK: ssubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsubl_s32 return vsubl_s32(a, b); - // CHECK: ssubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16> +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vsubl_u8 return vsubl_u8(a, b); - // CHECK: usubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t 
test_vsubl_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vsubl_u16 return vsubl_u16(a, b); - // CHECK: usubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vsubl_u32 return vsubl_u32(a, b); - // CHECK: usubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vsubl_high_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16> +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vsubl_high_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vsubl_high_s8 return vsubl_high_s8(a, b); - // CHECK: ssubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vsubl_high_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP5]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubl_high_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsubl_high_s16 return vsubl_high_s16(a, b); - // CHECK: ssubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vsubl_high_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP5]] +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vsubl_high_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsubl_high_s32 return vsubl_high_s32(a, b); - // CHECK: ssubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: 
define <8 x i16> @test_vsubl_high_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16> +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vsubl_high_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vsubl_high_u8 return vsubl_high_u8(a, b); - // CHECK: usubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vsubl_high_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP5]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vsubl_high_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vsubl_high_u16 return vsubl_high_u16(a, b); - // CHECK: usubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vsubl_high_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP5]] +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vsubl_high_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vsubl_high_u32 return vsubl_high_u32(a, b); - // CHECK: usubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16> +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsubw_s8 return vsubw_s8(a, b); - // CHECK: ssubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsubw_s16 return vsubw_s16(a, b); - // CHECK: ssubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h 
} +// CHECK-LABEL: define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsubw_s32 return vsubw_s32(a, b); - // CHECK: ssubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16> +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vsubw_u8 return vsubw_u8(a, b); - // CHECK: usubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vsubw_u16 return vsubw_u16(a, b); - // CHECK: usubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vsubw_u32 return vsubw_u32(a, b); - // CHECK: usubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vsubw_high_s8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vsubw_high_s8(int16x8_t a, int8x16_t b) { - // CHECK-LABEL: test_vsubw_high_s8 return vsubw_high_s8(a, b); - // CHECK: ssubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vsubw_high_s16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP2]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vsubw_high_s16(int32x4_t a, int16x8_t b) { - // CHECK-LABEL: test_vsubw_high_s16 return vsubw_high_s16(a, b); - // CHECK: ssubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vsubw_high_s32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x 
i32>
+// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP2]]
+// CHECK: ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubw_high_s32(int64x2_t a, int32x4_t b) {
- // CHECK-LABEL: test_vsubw_high_s32
   return vsubw_high_s32(a, b);
- // CHECK: ssubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i16> @test_vsubw_high_u8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]]
+// CHECK: ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubw_high_u8(uint16x8_t a, uint8x16_t b) {
- // CHECK-LABEL: test_vsubw_high_u8
   return vsubw_high_u8(a, b);
- // CHECK: usubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vsubw_high_u16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP2]]
+// CHECK: ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubw_high_u16(uint32x4_t a, uint16x8_t b) {
- // CHECK-LABEL: test_vsubw_high_u16
   return vsubw_high_u16(a, b);
- // CHECK: usubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vsubw_high_u32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP2]]
+// CHECK: ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubw_high_u32(uint64x2_t a, uint32x4_t b) {
- // CHECK-LABEL: test_vsubw_high_u32
   return vsubw_high_u32(a, b);
- // CHECK: usubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VADDHN2_I]]
 int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
- // CHECK-LABEL: test_vaddhn_s16
   return vaddhn_s16(a, b);
- // CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VADDHN2_I]]
 int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
- // CHECK-LABEL: test_vaddhn_s32
   return vaddhn_s32(a, b);
- // CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
+// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VADDHN2_I]]
 int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
- // CHECK-LABEL: test_vaddhn_s64
   return vaddhn_s64(a, b);
- // CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VADDHN2_I]]
 uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
- // CHECK-LABEL: test_vaddhn_u16
   return vaddhn_u16(a, b);
- // CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VADDHN2_I]]
 uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
- // CHECK-LABEL: test_vaddhn_u32
   return vaddhn_u32(a, b);
- // CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
+// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VADDHN2_I]]
 uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
- // CHECK-LABEL: test_vaddhn_u64
   return vaddhn_u64(a, b);
- // CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
+// CHECK-LABEL: define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:
[[VADDHN_I_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> +// CHECK: [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] int8x16_t test_vaddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vaddhn_high_s16 return vaddhn_high_s16(r, a, b); - // CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VADDHN_I_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], <i32 16, i32 16, i32 16, i32 16> +// CHECK: [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vaddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vaddhn_high_s32 return vaddhn_high_s32(r, a, b); - // CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VADDHN_I_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], <i64 32, i64 32> +// CHECK: [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vaddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vaddhn_high_s64 return vaddhn_high_s64(r, a, b); - // CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VADDHN_I_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> +// CHECK: [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] uint8x16_t test_vaddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vaddhn_high_u16 return vaddhn_high_u16(r, a, b); - // CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8>
[[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VADDHN_I_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], <i32 16, i32 16, i32 16, i32 16> +// CHECK: [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] uint16x8_t test_vaddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vaddhn_high_u32 return vaddhn_high_u32(r, a, b); - // CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VADDHN_I_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], <i64 32, i64 32> +// CHECK: [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] uint32x4_t test_vaddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vaddhn_high_u64 return vaddhn_high_u64(r, a, b); - // CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRADDHN_V2_I]] int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vraddhn_s16 return vraddhn_s16(a, b); - // CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vraddhn_s32 return vraddhn_s32(a, b); - // CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32>
@llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vraddhn_s64 return vraddhn_s64(a, b); - // CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRADDHN_V2_I]] uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vraddhn_u16 return vraddhn_u16(a, b); - // CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vraddhn_u32 return vraddhn_u32(a, b); - // CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vraddhn_u64 return vraddhn_u64(a, b); - // CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I_I]], <8 x i16> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8>
[[SHUFFLE_I_I]] int8x16_t test_vraddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vraddhn_high_s16 return vraddhn_high_s16(r, a, b); - // CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I_I]], <4 x i32> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vraddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vraddhn_high_s32 return vraddhn_high_s32(r, a, b); - // CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I_I]], <2 x i64> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vraddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vraddhn_high_s64 return vraddhn_high_s64(r, a, b); - // CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I_I]], <8 x i16> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] uint8x16_t test_vraddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vraddhn_high_u16 return vraddhn_high_u16(r, a, b); - // CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK:
[[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I_I]], <4 x i32> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] uint16x8_t test_vraddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vraddhn_high_u32 return vraddhn_high_u32(r, a, b); - // CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I_I]], <2 x i64> [[VRADDHN_V1_I_I]]) #4 +// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] uint32x4_t test_vraddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vraddhn_high_u64 return vraddhn_high_u64(r, a, b); - // CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> +// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[VSUBHN2_I]] int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsubhn_s16 return vsubhn_s16(a, b); - // CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16> +// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[VSUBHN2_I]] int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsubhn_s32 return vsubhn_s32(a, b); - // CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSUBHN_I:%.*]] = sub <2 x
i64> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32> +// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[VSUBHN2_I]] int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsubhn_s64 return vsubhn_s64(a, b); - // CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> +// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[VSUBHN2_I]] uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vsubhn_u16 return vsubhn_u16(a, b); - // CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16> +// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[VSUBHN2_I]] uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vsubhn_u32 return vsubhn_u32(a, b); - // CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32> +// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[VSUBHN2_I]] uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vsubhn_u64 return vsubhn_u64(a, b); - // CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> +// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] int8x16_t test_vsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsubhn_high_s16 return vsubhn_high_s16(r, a, b); - // CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +//
CHECK-LABEL: define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], <i32 16, i32 16, i32 16, i32 16> +// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsubhn_high_s32 return vsubhn_high_s32(r, a, b); - // CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], <i64 32, i64 32> +// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsubhn_high_s64 return vsubhn_high_s64(r, a, b); - // CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> +// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] uint8x16_t test_vsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vsubhn_high_u16 return vsubhn_high_u16(r, a, b); - // CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], <i32 16, i32 16, i32 16, i32 16> +// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] uint16x8_t test_vsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vsubhn_high_u32
return vsubhn_high_u32(r, a, b); - // CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], <i64 32, i64 32> +// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] uint32x4_t test_vsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vsubhn_high_u64 return vsubhn_high_u64(r, a, b); - // CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]] int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vrsubhn_s16 return vrsubhn_s16(a, b); - // CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vrsubhn_s32 return vrsubhn_s32(a, b); - // CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vrsubhn_s64 return vrsubhn_s64(a, b); - // CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK:
[[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]] uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vrsubhn_u16 return vrsubhn_u16(a, b); - // CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vrsubhn_u32 return vrsubhn_u32(a, b); - // CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vrsubhn_u64 return vrsubhn_u64(a, b); - // CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I_I]], <8 x i16> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] int8x16_t test_vrsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vrsubhn_high_s16 return vrsubhn_high_s16(r, a, b); - // CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK:
[[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I_I]], <4 x i32> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vrsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vrsubhn_high_s32 return vrsubhn_high_s32(r, a, b); - // CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I_I]], <2 x i64> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vrsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vrsubhn_high_s64 return vrsubhn_high_s64(r, a, b); - // CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I_I]], <8 x i16> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] uint8x16_t test_vrsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vrsubhn_high_u16 return vrsubhn_high_u16(r, a, b); - // CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I_I]], <4 x i32> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] uint16x8_t test_vrsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vrsubhn_high_u32
return vrsubhn_high_u32(r, a, b); - // CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I_I]], <2 x i64> [[VRSUBHN_V1_I_I]]) #4 +// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] uint32x4_t test_vrsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vrsubhn_high_u64 return vrsubhn_high_u64(r, a, b); - // CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I_I]] int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vabdl_s8 return vabdl_s8(a, b); - // CHECK: sabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I_I]] int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vabdl_s16 return vabdl_s16(a, b); - // CHECK: sabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I_I]] int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vabdl_s32 return vabdl_s32(a, b); - // CHECK: sabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I_I]] uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vabdl_u8 return vabdl_u8(a, b); - // CHECK: uabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I_I]] uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vabdl_u16 return vabdl_u16(a, b); - // CHECK: uabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I_I]] uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vabdl_u32 return vabdl_u32(a, b); - // CHECK: uabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { - // CHECK-LABEL: test_vabal_s8 return vabal_s8(a, b, c); - // CHECK: sabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { - // CHECK-LABEL: test_vabal_s16 return vabal_s16(a, b, c); - // CHECK: 
sabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { - // CHECK-LABEL: test_vabal_s32 return vabal_s32(a, b, c); - // CHECK: sabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { - // CHECK-LABEL: test_vabal_u8 return vabal_u8(a, b, c); - // CHECK: uabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { - // CHECK-LABEL: test_vabal_u16 return vabal_u16(a, b, c); - // CHECK: uabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { - // CHECK-LABEL: test_vabal_u32 return vabal_u32(a, b, c); - // CHECK: uabal {{v[0-9]+}}.2d, 
{{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I_I_I]] int16x8_t test_vabdl_high_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vabdl_high_s8 return vabdl_high_s8(a, b); - // CHECK: sabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I_I_I]] int32x4_t test_vabdl_high_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vabdl_high_s16 return vabdl_high_s16(a, b); - // CHECK: sabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I_I_I]] int64x2_t test_vabdl_high_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vabdl_high_s32 return vabdl_high_s32(a, b); - // CHECK: sabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I_I_I]] uint16x8_t test_vabdl_high_u8(uint8x16_t a, uint8x16_t b) { - //
CHECK-LABEL: test_vabdl_high_u8 return vabdl_high_u8(a, b); - // CHECK: uabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I_I_I]] uint32x4_t test_vabdl_high_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vabdl_high_u16 return vabdl_high_u16(a, b); - // CHECK: uabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I_I_I]] uint64x2_t test_vabdl_high_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vabdl_high_u32 return vabdl_high_u32(a, b); - // CHECK: uabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I_I]] int16x8_t test_vabal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { - // CHECK-LABEL: test_vabal_high_s8 return vabal_high_s8(a, b, c); - // CHECK: sabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast
<4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x i16> [[VABD1_I_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I_I]] int32x4_t test_vabal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { - // CHECK-LABEL: test_vabal_high_s16 return vabal_high_s16(a, b, c); - // CHECK: sabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> [[VABD1_I_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I_I]] int64x2_t test_vabal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { - // CHECK-LABEL: test_vabal_high_s32 return vabal_high_s32(a, b, c); - // CHECK: sabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I_I]] uint16x8_t test_vabal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { - // CHECK-LABEL: test_vabal_high_u8 return vabal_high_u8(a, b, c); - // CHECK: uabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x
i16> [[VABD1_I_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I_I]] uint32x4_t test_vabal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { - // CHECK-LABEL: test_vabal_high_u16 return vabal_high_u16(a, b, c); - // CHECK: uabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> [[VABD1_I_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I_I]] uint64x2_t test_vabal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { - // CHECK-LABEL: test_vabal_high_u32 return vabal_high_u32(a, b, c); - // CHECK: uabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i16> [[VMULL_I]] int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vmull_s8 return vmull_s8(a, b); - // CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vmull_s16 return vmull_s16(a, b); - // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vmull_s32 return vmull_s32(a, b); - // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL:
define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i16> [[VMULL_I]] uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vmull_u8 return vmull_u8(a, b); - // CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vmull_u16 return vmull_u16(a, b); - // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vmull_u32 return vmull_u32(a, b); - // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: ret <8 x i16> [[VMULL_I_I]] int16x8_t test_vmull_high_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vmull_high_s8 return vmull_high_s8(a, b); - // CHECK: smull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I_I]] int32x4_t test_vmull_high_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vmull_high_s16 return vmull_high_s16(a, b); - // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: 
[[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I_I]] int64x2_t test_vmull_high_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vmull_high_s32 return vmull_high_s32(a, b); - // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: ret <8 x i16> [[VMULL_I_I]] uint16x8_t test_vmull_high_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vmull_high_u8 return vmull_high_u8(a, b); - // CHECK: umull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I_I]] uint32x4_t test_vmull_high_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vmull_high_u16 return vmull_high_u16(a, b); - // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I_I]] uint64x2_t test_vmull_high_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vmull_high_u32 return vmull_high_u32(a, b); - // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { - // CHECK-LABEL: test_vmlal_s8 return vmlal_s8(a, b, c); - // CHECK: smlal {{v[0-9]+}}.8h, 
{{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { - // CHECK-LABEL: test_vmlal_s16 return vmlal_s16(a, b, c); - // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { - // CHECK-LABEL: test_vmlal_s32 return vmlal_s32(a, b, c); - // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { - // CHECK-LABEL: test_vmlal_u8 return vmlal_u8(a, b, c); - // CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { - // CHECK-LABEL: test_vmlal_u16 return vmlal_u16(a, b, c); - // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { - // CHECK-LABEL: test_vmlal_u32 return 
vmlal_u32(a, b, c); - // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> +// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I_I]] int16x8_t test_vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { - // CHECK-LABEL: test_vmlal_high_s8 return vmlal_high_s8(a, b, c); - // CHECK: smlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4 +// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I_I]] int32x4_t test_vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { - // CHECK-LABEL: test_vmlal_high_s16 return vmlal_high_s16(a, b, c); - // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4 +// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I_I]] int64x2_t test_vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { - // CHECK-LABEL: test_vmlal_high_s32 return vmlal_high_s32(a, b, c); - // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> +// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I_I]] uint16x8_t test_vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { - // CHECK-LABEL: test_vmlal_high_u8 return vmlal_high_u8(a, b, c); - // CHECK: umlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// 
CHECK-LABEL: define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4 +// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I_I]] uint32x4_t test_vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { - // CHECK-LABEL: test_vmlal_high_u16 return vmlal_high_u16(a, b, c); - // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4 +// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I_I]] uint64x2_t test_vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { - // CHECK-LABEL: test_vmlal_high_u32 return vmlal_high_u32(a, b, c); - // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) { - // CHECK-LABEL: test_vmlsl_s8 return vmlsl_s8(a, b, c); - // CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { - // CHECK-LABEL: test_vmlsl_s16 return vmlsl_s16(a, b, c); - // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> 
[[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { - // CHECK-LABEL: test_vmlsl_s32 return vmlsl_s32(a, b, c); - // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { - // CHECK-LABEL: test_vmlsl_u8 return vmlsl_u8(a, b, c); - // CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { - // CHECK-LABEL: test_vmlsl_u16 return vmlsl_u16(a, b, c); - // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { - // CHECK-LABEL: test_vmlsl_u32 return vmlsl_u32(a, b, c); - // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> +// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]] +// CHECK: ret <8 x i16> [[SUB_I_I]] int16x8_t test_vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { - // CHECK-LABEL: test_vmlsl_high_s8 return vmlsl_high_s8(a, b, c); - // CHECK: smlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> +// 
CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4 +// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[SUB_I_I]] int32x4_t test_vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { - // CHECK-LABEL: test_vmlsl_high_s16 return vmlsl_high_s16(a, b, c); - // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4 +// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[SUB_I_I]] int64x2_t test_vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { - // CHECK-LABEL: test_vmlsl_high_s32 return vmlsl_high_s32(a, b, c); - // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> +// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]] +// CHECK: ret <8 x i16> [[SUB_I_I]] uint16x8_t test_vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { - // CHECK-LABEL: test_vmlsl_high_u8 return vmlsl_high_u8(a, b, c); - // CHECK: umlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4 +// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]] +// CHECK: ret <4 x i32> [[SUB_I_I]] uint32x4_t test_vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { - // CHECK-LABEL: test_vmlsl_high_u16 return vmlsl_high_u16(a, b, c); - // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x 
i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4 +// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]] +// CHECK: ret <2 x i64> [[SUB_I_I]] uint64x2_t test_vmlsl_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { - // CHECK-LABEL: test_vmlsl_high_u32 return vmlsl_high_u32(a, b, c); - // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vqdmull_s16 return vqdmull_s16(a, b); - // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4 +// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vqdmull_s32 return vqdmull_s32(a, b); - // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { - // 
CHECK-LABEL: test_vqdmlal_s16 return vqdmlal_s16(a, b, c); - // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4 +// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { - // CHECK-LABEL: test_vqdmlal_s32 return vqdmlal_s32(a, b, c); - // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { - // CHECK-LABEL: test_vqdmlsl_s16 return vqdmlsl_s16(a, b, c); - // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4 +// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { - // CHECK-LABEL: test_vqdmlsl_s32 return vqdmlsl_s32(a, b, c); - // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// 
CHECK: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V1_I_I]]) #4 +// CHECK: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqdmull_high_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vqdmull_high_s16 return vqdmull_high_s16(a, b); - // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V1_I_I]]) #4 +// CHECK: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqdmull_high_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vqdmull_high_s32 return vqdmull_high_s32(a, b); - // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) #4 +// CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) #4 +// CHECK: ret <4 x i32> [[VQDMLAL_V3_I_I]] int32x4_t test_vqdmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { - // CHECK-LABEL: test_vqdmlal_high_s16 return vqdmlal_high_s16(a, b, c); - // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x 
i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) #4 +// CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) #4 +// CHECK: ret <2 x i64> [[VQDMLAL_V3_I_I]] int64x2_t test_vqdmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { - // CHECK-LABEL: test_vqdmlal_high_s32 return vqdmlal_high_s32(a, b, c); - // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) #4 +// CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) #4 +// CHECK: ret <4 x i32> [[VQDMLSL_V3_I_I]] int32x4_t test_vqdmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { - // CHECK-LABEL: test_vqdmlsl_high_s16 return vqdmlsl_high_s16(a, b, c); - // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> +// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) #4 +// CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) #4 +// CHECK: ret <2 x i64> [[VQDMLSL_V3_I_I]] int64x2_t test_vqdmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { - // CHECK-LABEL: test_vqdmlsl_high_s32 return vqdmlsl_high_s32(a, b, c); - // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i16> [[VMULL_I]] poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vmull_p8 return vmull_p8(a, b); - // CHECK: pmull 
{{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4 +// CHECK: ret <8 x i16> [[VMULL_I_I]] poly16x8_t test_vmull_high_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vmull_high_p8 return vmull_high_p8(a, b); - // CHECK: pmull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i64 @test_vaddd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VADDD_I:%.*]] = add i64 %a, %b +// CHECK: ret i64 [[VADDD_I]] int64_t test_vaddd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vaddd_s64 return vaddd_s64(a, b); -// CHECK: add {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}} } +// CHECK-LABEL: define i64 @test_vaddd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VADDD_I:%.*]] = add i64 %a, %b +// CHECK: ret i64 [[VADDD_I]] uint64_t test_vaddd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vaddd_u64 return vaddd_u64(a, b); -// CHECK: add {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}} } +// CHECK-LABEL: define i64 @test_vsubd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VSUBD_I:%.*]] = sub i64 %a, %b +// CHECK: ret i64 [[VSUBD_I]] int64_t test_vsubd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vsubd_s64 return vsubd_s64(a, b); -// CHECK: sub {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}} } +// CHECK-LABEL: define i64 @test_vsubd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VSUBD_I:%.*]] = sub i64 %a, %b +// CHECK: ret i64 [[VSUBD_I]] uint64_t test_vsubd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vsubd_u64 return vsubd_u64(a, b); -// CHECK: sub {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}} } +// CHECK-LABEL: define i8 @test_vqaddb_s8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_S8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] int8_t test_vqaddb_s8(int8_t a, int8_t b) { -// CHECK-LABEL: test_vqaddb_s8 return vqaddb_s8(a, b); -// CHECK: sqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } +// CHECK-LABEL: define i16 @test_vqaddh_s16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] int16_t test_vqaddh_s16(int16_t a, int16_t b) { -// CHECK-LABEL: test_vqaddh_s16 return vqaddh_s16(a, b); -// CHECK: sqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } +// CHECK-LABEL: define i32 @test_vqadds_s32(i32 %a, i32 %b) #0 { +// CHECK: [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQADDS_S32_I]] int32_t test_vqadds_s32(int32_t a, int32_t b) { -// CHECK-LABEL: test_vqadds_s32 return vqadds_s32(a, b); -// CHECK: sqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } +// CHECK-LABEL: define i64 @test_vqaddd_s64(i64 %a, 
i64 %b) #0 { +// CHECK: [[VQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQADDD_S64_I]] int64_t test_vqaddd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vqaddd_s64 return vqaddd_s64(a, b); -// CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define i8 @test_vqaddb_u8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_U8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] uint8_t test_vqaddb_u8(uint8_t a, uint8_t b) { -// CHECK-LABEL: test_vqaddb_u8 return vqaddb_u8(a, b); -// CHECK: uqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } +// CHECK-LABEL: define i16 @test_vqaddh_u16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_U16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] uint16_t test_vqaddh_u16(uint16_t a, uint16_t b) { -// CHECK-LABEL: test_vqaddh_u16 return vqaddh_u16(a, b); -// CHECK: uqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } +// CHECK-LABEL: define i32 @test_vqadds_u32(i32 %a, i32 %b) #0 { +// CHECK: [[VQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqadd.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQADDS_U32_I]] uint32_t test_vqadds_u32(uint32_t a, uint32_t b) { -// CHECK-LABEL: test_vqadds_u32 return vqadds_u32(a, b); -// CHECK: uqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } +// CHECK-LABEL: define i64 @test_vqaddd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqadd.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQADDD_U64_I]] uint64_t test_vqaddd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vqaddd_u64 return vqaddd_u64(a, b); -// CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define i8 @test_vqsubb_s8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VQSUBB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_S8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] int8_t test_vqsubb_s8(int8_t a, int8_t b) { -// CHECK-LABEL: test_vqsubb_s8 return vqsubb_s8(a, b); -// CHECK: sqsub {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } +// CHECK-LABEL: define i16 @test_vqsubh_s16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] int16_t test_vqsubh_s16(int16_t a, int16_t b) { -// CHECK-LABEL: test_vqsubh_s16 return vqsubh_s16(a, b); -// CHECK: sqsub {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } +// CHECK-LABEL: define i32 @test_vqsubs_s32(i32 
%a, i32 %b) #0 { +// CHECK: [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQSUBS_S32_I]] int32_t test_vqsubs_s32(int32_t a, int32_t b) { - // CHECK-LABEL: test_vqsubs_s32 return vqsubs_s32(a, b); -// CHECK: sqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } +// CHECK-LABEL: define i64 @test_vqsubd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VQSUBD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQSUBD_S64_I]] int64_t test_vqsubd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vqsubd_s64 return vqsubd_s64(a, b); -// CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define i8 @test_vqsubb_u8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VQSUBB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_U8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] uint8_t test_vqsubb_u8(uint8_t a, uint8_t b) { -// CHECK-LABEL: test_vqsubb_u8 return vqsubb_u8(a, b); -// CHECK: uqsub {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } +// CHECK-LABEL: define i16 @test_vqsubh_u16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQSUBH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_U16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] uint16_t test_vqsubh_u16(uint16_t a, uint16_t b) { -// CHECK-LABEL: test_vqsubh_u16 return vqsubh_u16(a, b); -// CHECK: uqsub {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } +// CHECK-LABEL: define i32 @test_vqsubs_u32(i32 %a, i32 %b) #0 { +// CHECK: [[VQSUBS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqsub.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQSUBS_U32_I]] uint32_t test_vqsubs_u32(uint32_t a, uint32_t b) { -// CHECK-LABEL: test_vqsubs_u32 return vqsubs_u32(a, b); -// CHECK: uqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } +// CHECK-LABEL: define i64 @test_vqsubd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VQSUBD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqsub.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQSUBD_U64_I]] uint64_t test_vqsubd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vqsubd_u64 return vqsubd_u64(a, b); -// CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define i64 @test_vshld_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VSHLD_S64_I]] int64_t test_vshld_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vshld_s64 return vshld_s64(a, b); -// CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define i64 @test_vshld_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VSHLD_U64_I]] uint64_t test_vshld_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vshld_u64 return vshld_u64(a, b); -// CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vqshlb_s8 +// CHECK-LABEL: define i8 @test_vqshlb_s8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 
0 +// CHECK: [[VQSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_S8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] int8_t test_vqshlb_s8(int8_t a, int8_t b) { return vqshlb_s8(a, b); -// CHECK: sqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } -// CHECK-LABEL: test_vqshlh_s16 +// CHECK-LABEL: define i16 @test_vqshlh_s16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] int16_t test_vqshlh_s16(int16_t a, int16_t b) { return vqshlh_s16(a, b); -// CHECK: sqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } -// CHECK-LABEL: test_vqshls_s32 +// CHECK-LABEL: define i32 @test_vqshls_s32(i32 %a, i32 %b) #0 { +// CHECK: [[VQSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQSHLS_S32_I]] int32_t test_vqshls_s32(int32_t a, int32_t b) { return vqshls_s32(a, b); -// CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } -// CHECK-LABEL: test_vqshld_s64 +// CHECK-LABEL: define i64 @test_vqshld_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VQSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQSHLD_S64_I]] int64_t test_vqshld_s64(int64_t a, int64_t b) { return vqshld_s64(a, b); -// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vqshlb_u8 +// CHECK-LABEL: define i8 @test_vqshlb_u8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VQSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_U8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] uint8_t test_vqshlb_u8(uint8_t a, uint8_t b) { return vqshlb_u8(a, b); -// CHECK: uqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } -// CHECK-LABEL: test_vqshlh_u16 +// CHECK-LABEL: define i16 @test_vqshlh_u16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_U16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] uint16_t test_vqshlh_u16(uint16_t a, uint16_t b) { return vqshlh_u16(a, b); -// CHECK: uqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } -// CHECK-LABEL: test_vqshls_u32 +// CHECK-LABEL: define i32 @test_vqshls_u32(i32 %a, i32 %b) #0 { +// CHECK: [[VQSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQSHLS_U32_I]] uint32_t test_vqshls_u32(uint32_t a, uint32_t b) { return vqshls_u32(a, b); -// CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } -// CHECK-LABEL: test_vqshld_u64 +// CHECK-LABEL: define i64 @test_vqshld_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VQSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQSHLD_U64_I]] 
uint64_t test_vqshld_u64(uint64_t a, uint64_t b) { return vqshld_u64(a, b); -// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vrshld_s64 +// CHECK-LABEL: define i64 @test_vrshld_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VRSHLD_S64_I]] int64_t test_vrshld_s64(int64_t a, int64_t b) { return vrshld_s64(a, b); -// CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vrshld_u64 +// CHECK-LABEL: define i64 @test_vrshld_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VRSHLD_U64_I]] uint64_t test_vrshld_u64(uint64_t a, uint64_t b) { return vrshld_u64(a, b); -// CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vqrshlb_s8 +// CHECK-LABEL: define i8 @test_vqrshlb_s8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VQRSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_S8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] int8_t test_vqrshlb_s8(int8_t a, int8_t b) { return vqrshlb_s8(a, b); -// CHECK: sqrshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } -// CHECK-LABEL: test_vqrshlh_s16 +// CHECK-LABEL: define i16 @test_vqrshlh_s16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQRSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] int16_t test_vqrshlh_s16(int16_t a, int16_t b) { return vqrshlh_s16(a, b); -// CHECK: sqrshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } -// CHECK-LABEL: test_vqrshls_s32 +// CHECK-LABEL: define i32 @test_vqrshls_s32(i32 %a, i32 %b) #0 { +// CHECK: [[VQRSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrshl.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQRSHLS_S32_I]] int32_t test_vqrshls_s32(int32_t a, int32_t b) { return vqrshls_s32(a, b); -// CHECK: sqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } -// CHECK-LABEL: test_vqrshld_s64 +// CHECK-LABEL: define i64 @test_vqrshld_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VQRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQRSHLD_S64_I]] int64_t test_vqrshld_s64(int64_t a, int64_t b) { return vqrshld_s64(a, b); -// CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vqrshlb_u8 +// CHECK-LABEL: define i8 @test_vqrshlb_u8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VQRSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_U8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] uint8_t test_vqrshlb_u8(uint8_t a, uint8_t b) { return vqrshlb_u8(a, b); -// CHECK: uqrshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} } -// CHECK-LABEL: test_vqrshlh_u16 +// CHECK-LABEL: define i16 @test_vqrshlh_u16(i16 %a, i16 %b) #0 { +// 
CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQRSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_U16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] uint16_t test_vqrshlh_u16(uint16_t a, uint16_t b) { return vqrshlh_u16(a, b); -// CHECK: uqrshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} } -// CHECK-LABEL: test_vqrshls_u32 +// CHECK-LABEL: define i32 @test_vqrshls_u32(i32 %a, i32 %b) #0 { +// CHECK: [[VQRSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqrshl.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VQRSHLS_U32_I]] uint32_t test_vqrshls_u32(uint32_t a, uint32_t b) { return vqrshls_u32(a, b); -// CHECK: uqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} } -// CHECK-LABEL: test_vqrshld_u64 +// CHECK-LABEL: define i64 @test_vqrshld_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VQRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VQRSHLD_U64_I]] uint64_t test_vqrshld_u64(uint64_t a, uint64_t b) { return vqrshld_u64(a, b); -// CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vpaddd_s64 +// CHECK-LABEL: define i64 @test_vpaddd_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VPADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[TMP1]]) #4 +// CHECK: ret i64 [[VPADDD_S64_I]] int64_t test_vpaddd_s64(int64x2_t a) { return vpaddd_s64(a); -// CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d } -// CHECK-LABEL: test_vpadds_f32 +// CHECK-LABEL: define float @test_vpadds_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE0_I:%.*]] = extractelement <2 x float> [[TMP1]], i64 0 +// CHECK: [[LANE1_I:%.*]] = extractelement <2 x float> [[TMP1]], i64 1 +// CHECK: [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]] +// CHECK: ret float [[VPADDD_I]] float32_t test_vpadds_f32(float32x2_t a) { return vpadds_f32(a); -// CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s } -// CHECK-LABEL: test_vpaddd_f64 +// CHECK-LABEL: define double @test_vpaddd_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[LANE0_I:%.*]] = extractelement <2 x double> [[TMP1]], i64 0 +// CHECK: [[LANE1_I:%.*]] = extractelement <2 x double> [[TMP1]], i64 1 +// CHECK: [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]] +// CHECK: ret double [[VPADDD_I]] float64_t test_vpaddd_f64(float64x2_t a) { return vpaddd_f64(a); -// CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d } -// CHECK-LABEL: test_vpmaxnms_f32 +// CHECK-LABEL: define float @test_vpmaxnms_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPMAXNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: ret float [[VPMAXNMS_F32_I]] float32_t test_vpmaxnms_f32(float32x2_t a) { return vpmaxnms_f32(a); -// CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s } -// CHECK-LABEL: test_vpmaxnmqd_f64 +// CHECK-LABEL: define double @test_vpmaxnmqd_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = 
-// CHECK-LABEL: test_vpmaxnms_f32
+// CHECK-LABEL: define float @test_vpmaxnms_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[VPMAXNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK: ret float [[VPMAXNMS_F32_I]]
float32_t test_vpmaxnms_f32(float32x2_t a) {
  return vpmaxnms_f32(a);
-// CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
}

-// CHECK-LABEL: test_vpmaxnmqd_f64
+// CHECK-LABEL: define double @test_vpmaxnmqd_f64(<2 x double> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[VPMAXNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK: ret double [[VPMAXNMQD_F64_I]]
float64_t test_vpmaxnmqd_f64(float64x2_t a) {
  return vpmaxnmqd_f64(a);
-// CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
}

-// CHECK-LABEL: test_vpmaxs_f32
+// CHECK-LABEL: define float @test_vpmaxs_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[VPMAXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK: ret float [[VPMAXS_F32_I]]
float32_t test_vpmaxs_f32(float32x2_t a) {
  return vpmaxs_f32(a);
-// CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s
}

-// CHECK-LABEL: test_vpmaxqd_f64
+// CHECK-LABEL: define double @test_vpmaxqd_f64(<2 x double> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[VPMAXQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK: ret double [[VPMAXQD_F64_I]]
float64_t test_vpmaxqd_f64(float64x2_t a) {
  return vpmaxqd_f64(a);
-// CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d
}

-// CHECK-LABEL: test_vpminnms_f32
+// CHECK-LABEL: define float @test_vpminnms_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[VPMINNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK: ret float [[VPMINNMS_F32_I]]
float32_t test_vpminnms_f32(float32x2_t a) {
  return vpminnms_f32(a);
-// CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
}

-// CHECK-LABEL: test_vpminnmqd_f64
+// CHECK-LABEL: define double @test_vpminnmqd_f64(<2 x double> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[VPMINNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK: ret double [[VPMINNMQD_F64_I]]
float64_t test_vpminnmqd_f64(float64x2_t a) {
  return vpminnmqd_f64(a);
-// CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
}

-// CHECK-LABEL: test_vpmins_f32
+// CHECK-LABEL: define float @test_vpmins_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[VPMINS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK: ret float [[VPMINS_F32_I]]
float32_t test_vpmins_f32(float32x2_t a) {
  return vpmins_f32(a);
-// CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s
}

-// CHECK-LABEL: test_vpminqd_f64
+// CHECK-LABEL: define double @test_vpminqd_f64(<2 x double> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[VPMINQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK: ret double [[VPMINQD_F64_I]]
float64_t test_vpminqd_f64(float64x2_t a) {
  return vpminqd_f64(a);
-// CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d
}
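The "nm" variants reduce with IEEE maxNum/minNum semantics, so a quiet NaN lane loses to a numeric lane, while the plain pairwise max/min propagates the NaN. A sketch of the difference (illustrative harness, assuming the usual IEEE-754 behavior of fmaxnmp versus fmaxp):

#include <arm_neon.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  float data[2] = {NAN, 2.0f};
  float32x2_t v = vld1_f32(data);
  /* maxNum semantics: the quiet NaN is ignored, yielding 2.0. */
  printf("%f\n", vpmaxnms_f32(v)); /* expect: 2.000000 */
  /* Plain pairwise max propagates the NaN instead. */
  printf("%f\n", vpmaxs_f32(v));   /* expect: nan */
  return 0;
}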
+// CHECK-LABEL: define i16 @test_vqdmulhh_s16(i16 %a, i16 %b) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK: ret i16 [[TMP2]]
int16_t test_vqdmulhh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vqdmulhh_s16
  return vqdmulhh_s16(a, b);
-// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
}

+// CHECK-LABEL: define i32 @test_vqdmulhs_s32(i32 %a, i32 %b) #0 {
+// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 %b) #4
+// CHECK: ret i32 [[VQDMULHS_S32_I]]
int32_t test_vqdmulhs_s32(int32_t a, int32_t b) {
-// CHECK-LABEL: test_vqdmulhs_s32
  return vqdmulhs_s32(a, b);
-// CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
}

+// CHECK-LABEL: define i16 @test_vqrdmulhh_s16(i16 %a, i16 %b) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK: ret i16 [[TMP2]]
int16_t test_vqrdmulhh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vqrdmulhh_s16
  return vqrdmulhh_s16(a, b);
-// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
}

+// CHECK-LABEL: define i32 @test_vqrdmulhs_s32(i32 %a, i32 %b) #0 {
+// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 %b) #4
+// CHECK: ret i32 [[VQRDMULHS_S32_I]]
int32_t test_vqrdmulhs_s32(int32_t a, int32_t b) {
-// CHECK-LABEL: test_vqrdmulhs_s32
  return vqrdmulhs_s32(a, b);
-// CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
}
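sqdmulh and sqrdmulh return the high half of a saturating doubling multiply, i.e. a Q15/Q31 fixed-point multiply; only -1.0 * -1.0 can saturate. For example (illustrative harness):

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  /* Q15 fixed point: 0.5 * 0.5 = 0.25, i.e. 16384 -> 8192, computed
     as (2*a*b) >> 16. */
  printf("%d\n", vqdmulhh_s16(16384, 16384));   /* expect: 8192 */
  /* The one overflow case: -1.0 * -1.0 doubles to 2^31 and saturates,
     giving INT16_MAX after the shift. */
  printf("%d\n", vqdmulhh_s16(-32768, -32768)); /* expect: 32767 */
  return 0;
}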
+// CHECK-LABEL: define float @test_vmulxs_f32(float %a, float %b) #0 {
+// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) #4
+// CHECK: ret float [[VMULXS_F32_I]]
float32_t test_vmulxs_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vmulxs_f32
  return vmulxs_f32(a, b);
-// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
}

+// CHECK-LABEL: define double @test_vmulxd_f64(double %a, double %b) #0 {
+// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) #4
+// CHECK: ret double [[VMULXD_F64_I]]
float64_t test_vmulxd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vmulxd_f64
  return vmulxd_f64(a, b);
-// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}

+// CHECK-LABEL: define <1 x double> @test_vmulx_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK: [[VMULX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmulx.v1f64(<1 x double> [[VMULX_I]], <1 x double> [[VMULX1_I]]) #4
+// CHECK: ret <1 x double> [[VMULX2_I]]
float64x1_t test_vmulx_f64(float64x1_t a, float64x1_t b) {
-// CHECK-LABEL: test_vmulx_f64
  return vmulx_f64(a, b);
-// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}

+// CHECK-LABEL: define float @test_vrecpss_f32(float %a, float %b) #0 {
+// CHECK: [[VRECPS_I:%.*]] = call float @llvm.aarch64.neon.frecps.f32(float %a, float %b) #4
+// CHECK: ret float [[VRECPS_I]]
float32_t test_vrecpss_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vrecpss_f32
  return vrecpss_f32(a, b);
-// CHECK: frecps {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
}

+// CHECK-LABEL: define double @test_vrecpsd_f64(double %a, double %b) #0 {
+// CHECK: [[VRECPS_I:%.*]] = call double @llvm.aarch64.neon.frecps.f64(double %a, double %b) #4
+// CHECK: ret double [[VRECPS_I]]
float64_t test_vrecpsd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vrecpsd_f64
  return vrecpsd_f64(a, b);
-// CHECK: frecps {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}

+// CHECK-LABEL: define float @test_vrsqrtss_f32(float %a, float %b) #0 {
+// CHECK: [[VRSQRTSS_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrts.f32(float %a, float %b) #4
+// CHECK: ret float [[VRSQRTSS_F32_I]]
float32_t test_vrsqrtss_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vrsqrtss_f32
  return vrsqrtss_f32(a, b);
-// CHECK: frsqrts {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
}

+// CHECK-LABEL: define double @test_vrsqrtsd_f64(double %a, double %b) #0 {
+// CHECK: [[VRSQRTSD_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrts.f64(double %a, double %b) #4
+// CHECK: ret double [[VRSQRTSD_F64_I]]
float64_t test_vrsqrtsd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vrsqrtsd_f64
  return vrsqrtsd_f64(a, b);
-// CHECK: frsqrts {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
}
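frecps is the Newton-Raphson step 2 - a*b, which pairs with the frecpe estimate to refine a reciprocal. A sketch of the usual two-step iteration (the recip helper name is illustrative, not part of the test):

#include <arm_neon.h>
#include <stdio.h>

/* frecps computes 2 - d*x, so x * (2 - d*x) is one Newton-Raphson
   step toward 1/d. */
static float recip(float d) {
  float x = vrecpes_f32(d);      /* low-precision initial estimate */
  x = vrecpss_f32(d, x) * x;
  x = vrecpss_f32(d, x) * x;
  return x;
}

int main(void) {
  printf("%f\n", recip(3.0f)); /* expect: ~0.333333 */
  return 0;
}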
+// CHECK-LABEL: define float @test_vcvts_f32_s32(i32 %a) #0 {
+// CHECK: [[TMP0:%.*]] = sitofp i32 %a to float
+// CHECK: ret float [[TMP0]]
float32_t test_vcvts_f32_s32(int32_t a) {
-// CHECK-LABEL: test_vcvts_f32_s32
-// CHECK: scvtf {{s[0-9]+}}, {{[ws][0-9]+}}
  return vcvts_f32_s32(a);
}

+// CHECK-LABEL: define double @test_vcvtd_f64_s64(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = sitofp i64 %a to double
+// CHECK: ret double [[TMP0]]
float64_t test_vcvtd_f64_s64(int64_t a) {
-// CHECK-LABEL: test_vcvtd_f64_s64
-// CHECK: scvtf {{d[0-9]+}}, {{[dx][0-9]+}}
  return vcvtd_f64_s64(a);
}

+// CHECK-LABEL: define float @test_vcvts_f32_u32(i32 %a) #0 {
+// CHECK: [[TMP0:%.*]] = uitofp i32 %a to float
+// CHECK: ret float [[TMP0]]
float32_t test_vcvts_f32_u32(uint32_t a) {
-// CHECK-LABEL: test_vcvts_f32_u32
-// CHECK: ucvtf {{s[0-9]+}}, {{[ws][0-9]+}}
  return vcvts_f32_u32(a);
}

+// CHECK-LABEL: define double @test_vcvtd_f64_u64(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = uitofp i64 %a to double
+// CHECK: ret double [[TMP0]]
float64_t test_vcvtd_f64_u64(uint64_t a) {
-// CHECK-LABEL: test_vcvtd_f64_u64
-// CHECK: ucvtf {{d[0-9]+}}, {{[xd][0-9]+}}
  return vcvtd_f64_u64(a);
}

+// CHECK-LABEL: define float @test_vrecpes_f32(float %a) #0 {
+// CHECK: [[VRECPES_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpe.f32(float %a) #4
+// CHECK: ret float [[VRECPES_F32_I]]
float32_t test_vrecpes_f32(float32_t a) {
-// CHECK-LABEL: test_vrecpes_f32
-// CHECK: frecpe {{s[0-9]+}}, {{s[0-9]+}}
  return vrecpes_f32(a);
}

+// CHECK-LABEL: define double @test_vrecped_f64(double %a) #0 {
+// CHECK: [[VRECPED_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpe.f64(double %a) #4
+// CHECK: ret double [[VRECPED_F64_I]]
float64_t test_vrecped_f64(float64_t a) {
-// CHECK-LABEL: test_vrecped_f64
-// CHECK: frecpe {{d[0-9]+}}, {{d[0-9]+}}
  return vrecped_f64(a);
}

+// CHECK-LABEL: define float @test_vrecpxs_f32(float %a) #0 {
+// CHECK: [[VRECPXS_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpx.f32(float %a) #4
+// CHECK: ret float [[VRECPXS_F32_I]]
float32_t test_vrecpxs_f32(float32_t a) {
-// CHECK-LABEL: test_vrecpxs_f32
-// CHECK: frecpx {{s[0-9]+}}, {{s[0-9]+}}
  return vrecpxs_f32(a);
}

+// CHECK-LABEL: define double @test_vrecpxd_f64(double %a) #0 {
+// CHECK: [[VRECPXD_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpx.f64(double %a) #4
+// CHECK: ret double [[VRECPXD_F64_I]]
float64_t test_vrecpxd_f64(float64_t a) {
-// CHECK-LABEL: test_vrecpxd_f64
-// CHECK: frecpx {{d[0-9]+}}, {{d[0-9]+}}
  return vrecpxd_f64(a);
}

+// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> [[VRSQRTE_V_I]]) #4
+// CHECK: ret <2 x i32> [[VRSQRTE_V1_I]]
uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
-// CHECK-LABEL: test_vrsqrte_u32
-// CHECK: ursqrte {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  return vrsqrte_u32(a);
}

+// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> [[VRSQRTEQ_V_I]]) #4
+// CHECK: ret <4 x i32> [[VRSQRTEQ_V1_I]]
uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
-// CHECK-LABEL: test_vrsqrteq_u32
-// CHECK: ursqrte {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  return vrsqrteq_u32(a);
}

+// CHECK-LABEL: define float @test_vrsqrtes_f32(float %a) #0 {
+// CHECK: [[VRSQRTES_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrte.f32(float %a) #4
+// CHECK: ret float [[VRSQRTES_F32_I]]
float32_t test_vrsqrtes_f32(float32_t a) {
-// CHECK: vrsqrtes_f32
-// CHECK: frsqrte {{s[0-9]+}}, {{s[0-9]+}}
  return vrsqrtes_f32(a);
}

+// CHECK-LABEL: define double @test_vrsqrted_f64(double %a) #0 {
+// CHECK: [[VRSQRTED_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrte.f64(double %a) #4
+// CHECK: ret double [[VRSQRTED_F64_I]]
float64_t test_vrsqrted_f64(float64_t a) {
-// CHECK: vrsqrted_f64
-// CHECK: frsqrte {{d[0-9]+}}, {{d[0-9]+}}
  return vrsqrted_f64(a);
}
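Similarly, frsqrts computes (3 - a*b)/2 and pairs with the frsqrte estimate for reciprocal square roots. A sketch (again, the rsqrt helper name is illustrative):

#include <arm_neon.h>
#include <stdio.h>

/* frsqrts computes (3 - a*b)/2, so x * frsqrts(d*x, x) is one
   Newton-Raphson step toward 1/sqrt(d). */
static float rsqrt(float d) {
  float x = vrsqrtes_f32(d);     /* low-precision initial estimate */
  x = x * vrsqrtss_f32(d * x, x);
  x = x * vrsqrtss_f32(d * x, x);
  return x;
}

int main(void) {
  printf("%f\n", rsqrt(4.0f)); /* expect: ~0.5 */
  return 0;
}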
+// CHECK-LABEL: define <16 x i8> @test_vld1q_u8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK: ret <16 x i8> [[TMP1]]
uint8x16_t test_vld1q_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1q_u8
  return vld1q_u8(a);
-  // CHECK: {{ld1 { v[0-9]+.16b }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <8 x i16> @test_vld1q_u16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK: ret <8 x i16> [[TMP2]]
uint16x8_t test_vld1q_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1q_u16
  return vld1q_u16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <4 x i32> @test_vld1q_u32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
+// CHECK: ret <4 x i32> [[TMP2]]
uint32x4_t test_vld1q_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1q_u32
  return vld1q_u32(a);
-  // CHECK: {{ld1 { v[0-9]+.4s }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <2 x i64> @test_vld1q_u64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
+// CHECK: ret <2 x i64> [[TMP2]]
uint64x2_t test_vld1q_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1q_u64
  return vld1q_u64(a);
-  // CHECK: {{ld1 { v[0-9]+.2d }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <16 x i8> @test_vld1q_s8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK: ret <16 x i8> [[TMP1]]
int8x16_t test_vld1q_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld1q_s8
  return vld1q_s8(a);
-  // CHECK: {{ld1 { v[0-9]+.16b }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <8 x i16> @test_vld1q_s16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vld1q_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld1q_s16
  return vld1q_s16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <4 x i32> @test_vld1q_s32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vld1q_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld1q_s32
  return vld1q_s32(a);
-  // CHECK: {{ld1 { v[0-9]+.4s }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <2 x i64> @test_vld1q_s64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
+// CHECK: ret <2 x i64> [[TMP2]]
int64x2_t test_vld1q_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld1q_s64
  return vld1q_s64(a);
-  // CHECK: {{ld1 { v[0-9]+.2d }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <8 x half> @test_vld1q_f16(half* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <8 x half>
+// CHECK: ret <8 x half> [[TMP3]]
float16x8_t test_vld1q_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld1q_f16
  return vld1q_f16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <4 x float> @test_vld1q_f32(float* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]]
+// CHECK: ret <4 x float> [[TMP2]]
float32x4_t test_vld1q_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld1q_f32
  return vld1q_f32(a);
-  // CHECK: {{ld1 { v[0-9]+.4s }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <2 x double> @test_vld1q_f64(double* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x double>*
+// CHECK: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]]
+// CHECK: ret <2 x double> [[TMP2]]
float64x2_t test_vld1q_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld1q_f64
  return vld1q_f64(a);
-  // CHECK: {{ld1 { v[0-9]+.2d }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define <16 x i8> @test_vld1q_p8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK: ret <16 x i8> [[TMP1]]
poly8x16_t test_vld1q_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1q_p8
  return vld1q_p8(a);
-  // CHECK: {{ld1 { v[0-9]+.16b }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <8 x i16> @test_vld1q_p16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK: ret <8 x i16> [[TMP2]]
poly16x8_t test_vld1q_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1q_p16
  return vld1q_p16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <8 x i8> @test_vld1_u8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK: ret <8 x i8> [[TMP1]]
uint8x8_t test_vld1_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1_u8
  return vld1_u8(a);
-  // CHECK: {{ld1 { v[0-9]+.8b }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <4 x i16> @test_vld1_u16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vld1_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1_u16
  return vld1_u16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <2 x i32> @test_vld1_u32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vld1_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1_u32
  return vld1_u32(a);
-  // CHECK: {{ld1 { v[0-9]+.2s }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <1 x i64> @test_vld1_u64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK: [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
+// CHECK: ret <1 x i64> [[TMP2]]
uint64x1_t test_vld1_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1_u64
  return vld1_u64(a);
-  // CHECK: {{ld1 { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <8 x i8> @test_vld1_s8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK: ret <8 x i8> [[TMP1]]
int8x8_t test_vld1_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld1_s8
  return vld1_s8(a);
-  // CHECK: {{ld1 { v[0-9]+.8b }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <4 x i16> @test_vld1_s16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vld1_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld1_s16
  return vld1_s16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <2 x i32> @test_vld1_s32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vld1_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld1_s32
  return vld1_s32(a);
-  // CHECK: {{ld1 { v[0-9]+.2s }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define <1 x i64> @test_vld1_s64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK: [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
+// CHECK: ret <1 x i64> [[TMP2]]
int64x1_t test_vld1_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld1_s64
  return vld1_s64(a);
-  // CHECK: {{ld1 { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <4 x half> @test_vld1_f16(half* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half>
+// CHECK: ret <4 x half> [[TMP3]]
float16x4_t test_vld1_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld1_f16
  return vld1_f16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <2 x float> @test_vld1_f32(float* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]]
+// CHECK: ret <2 x float> [[TMP2]]
float32x2_t test_vld1_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld1_f32
  return vld1_f32(a);
-  // CHECK: {{ld1 { v[0-9]+.2s }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <1 x double> @test_vld1_f64(double* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x double>*
+// CHECK: [[TMP2:%.*]] = load <1 x double>, <1 x double>* [[TMP1]]
+// CHECK: ret <1 x double> [[TMP2]]
float64x1_t test_vld1_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld1_f64
  return vld1_f64(a);
-  // CHECK: {{ld1 { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <8 x i8> @test_vld1_p8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK: ret <8 x i8> [[TMP1]]
poly8x8_t test_vld1_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1_p8
  return vld1_p8(a);
-  // CHECK: {{ld1 { v[0-9]+.8b }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define <4 x i16> @test_vld1_p16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK: ret <4 x i16> [[TMP2]]
poly16x4_t test_vld1_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1_p16
  return vld1_p16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
}
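The vld1/vld1q family is a plain contiguous load: lanes keep memory order, which is why it lowers to a single IR load. For example (illustrative harness):

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  uint8_t bytes[16] = {0, 1, 2, 3, 4, 5, 6, 7,
                       8, 9, 10, 11, 12, 13, 14, 15};
  /* One unstructured 128-bit load; lane i holds bytes[i]. */
  uint8x16_t v = vld1q_u8(bytes);
  printf("%u %u\n", vgetq_lane_u8(v, 0), vgetq_lane_u8(v, 15)); /* 0 15 */
  return 0;
}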
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x2_t [[TMP5]]
uint8x16x2_t test_vld2q_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld2q_u8
  return vld2q_u8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x2_t [[TMP6]]
uint16x8x2_t test_vld2q_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld2q_u16
  return vld2q_u16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x2_t [[TMP6]]
uint32x4x2_t test_vld2q_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld2q_u32
  return vld2q_u32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
uint64x2x2_t test_vld2q_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld2q_u64
  return vld2q_u64(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int8x16x2_t [[TMP5]]
int8x16x2_t test_vld2q_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld2q_s8
  return vld2q_s8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x2_t [[TMP6]]
int16x8x2_t test_vld2q_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld2q_s16
  return vld2q_s16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x2_t [[TMP6]]
int32x4x2_t test_vld2q_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld2q_s32
  return vld2q_s32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int64x2x2_t [[TMP6]]
int64x2x2_t test_vld2q_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld2q_s64
  return vld2q_s64(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float16x8x2_t [[TMP6]]
float16x8x2_t test_vld2q_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld2q_f16
  return vld2q_f16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
+// CHECK: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
+// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float32x4x2_t [[TMP6]]
float32x4x2_t test_vld2q_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld2q_f32
  return vld2q_f32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_f64(double* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x double>*
+// CHECK: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0v2f64(<2 x double>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
+// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float64x2x2_t [[TMP6]]
float64x2x2_t test_vld2q_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld2q_f64
  return vld2q_f64(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly8x16x2_t [[TMP5]]
poly8x16x2_t test_vld2q_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld2q_p8
  return vld2q_p8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly16x8x2_t [[TMP6]]
poly16x8x2_t test_vld2q_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld2q_p16
  return vld2q_p16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
}
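Unlike vld1q, the vld2q family deinterleaves while loading, which is why it lowers to the llvm.aarch64.neon.ld2 intrinsic returning a two-vector struct. A sketch of the lane placement (illustrative harness):

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  /* Interleaved pairs in memory: a0 b0 a1 b1 ... */
  uint8_t pairs[32];
  for (int i = 0; i < 32; ++i)
    pairs[i] = (uint8_t)i;
  /* val[0] receives the even-indexed bytes, val[1] the odd-indexed. */
  uint8x16x2_t v = vld2q_u8(pairs);
  printf("%u %u\n", vgetq_lane_u8(v.val[0], 1),  /* pairs[2] = 2 */
                    vgetq_lane_u8(v.val[1], 1)); /* pairs[3] = 3 */
  return 0;
}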
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x2_t [[TMP5]]
uint8x8x2_t test_vld2_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld2_u8
  return vld2_u8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x2_t [[TMP6]]
uint16x4x2_t test_vld2_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld2_u16
  return vld2_u16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x2_t [[TMP6]]
uint32x2x2_t test_vld2_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld2_u32
  return vld2_u32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint64x1x2_t [[TMP6]]
uint64x1x2_t test_vld2_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld2_u64
  return vld2_u64(a);
-  // CHECK: {{ld1|ld2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x2_t [[TMP5]]
int8x8x2_t test_vld2_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld2_s8
  return vld2_s8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x2_t [[TMP6]]
int16x4x2_t test_vld2_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld2_s16
  return vld2_s16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x2_t [[TMP6]]
int32x2x2_t test_vld2_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld2_s32
  return vld2_s32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int64x1x2_t [[TMP6]]
int64x1x2_t test_vld2_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld2_s64
  return vld2_s64(a);
-  // CHECK: {{ld1|ld2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float16x4x2_t [[TMP6]]
float16x4x2_t test_vld2_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld2_f16
  return vld2_f16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
+// CHECK: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0v2f32(<2 x float>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x2_t [[TMP6]]
float32x2x2_t test_vld2_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld2_f32
  return vld2_f32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_f64(double* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x double>*
+// CHECK: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0v1f64(<1 x double>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
+// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float64x1x2_t [[TMP6]]
float64x1x2_t test_vld2_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld2_f64
  return vld2_f64(a);
-  // CHECK: {{ld1|ld2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x2_t [[TMP5]]
poly8x8x2_t test_vld2_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld2_p8
  return vld2_p8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
poly16x4x2_t test_vld2_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld2_p16
  return vld2_p16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x3_t [[TMP5]]
uint8x16x3_t test_vld3q_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld3q_u8
  return vld3q_u8(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x3_t [[TMP6]]
uint16x8x3_t test_vld3q_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld3q_u16
  return vld3q_u16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
}

+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x3_t [[TMP6]]
uint32x4x3_t test_vld3q_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld3q_u32
  return vld3q_u32(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
}
vld3q_u64(a); - // CHECK: ld3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x3_t [[TMP5]] int8x16x3_t test_vld3q_s8(int8_t const *a) { - // CHECK-LABEL: test_vld3q_s8 return vld3q_s8(a); - // CHECK: ld3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x3_t [[TMP6]] int16x8x3_t test_vld3q_s16(int16_t const *a) { - // CHECK-LABEL: test_vld3q_s16 return vld3q_s16(a); - // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>* +// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: 
[[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x3_t [[TMP6]] int32x4x3_t test_vld3q_s32(int32_t const *a) { - // CHECK-LABEL: test_vld3q_s32 return vld3q_s32(a); - // CHECK: ld3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>* +// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x3_t [[TMP6]] int64x2x3_t test_vld3q_s64(int64_t const *a) { - // CHECK-LABEL: test_vld3q_s64 return vld3q_s64(a); - // CHECK: ld3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x3_t [[TMP6]] float16x8x3_t test_vld3q_f16(float16_t const *a) { - // CHECK-LABEL: test_vld3q_f16 return vld3q_f16(a); - // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>* +// CHECK: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0v4f32(<4 x float>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x 
float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x3_t [[TMP6]] float32x4x3_t test_vld3q_f32(float32_t const *a) { - // CHECK-LABEL: test_vld3q_f32 return vld3q_f32(a); - // CHECK: ld3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x double>* +// CHECK: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0v2f64(<2 x double>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x3_t [[TMP6]] float64x2x3_t test_vld3q_f64(float64_t const *a) { - // CHECK-LABEL: test_vld3q_f64 return vld3q_f64(a); - // CHECK: ld3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x3_t [[TMP5]] poly8x16x3_t test_vld3q_p8(poly8_t const *a) { - // CHECK-LABEL: test_vld3q_p8 return vld3q_p8(a); - // CHECK: ld3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// 
CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x3_t [[TMP6]] poly16x8x3_t test_vld3q_p16(poly16_t const *a) { - // CHECK-LABEL: test_vld3q_p16 return vld3q_p16(a); - // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x3_t [[TMP5]] uint8x8x3_t test_vld3_u8(uint8_t const *a) { - // CHECK-LABEL: test_vld3_u8 return vld3_u8(a); - // CHECK: ld3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x3_t [[TMP6]] uint16x4x3_t test_vld3_u16(uint16_t const *a) { - // CHECK-LABEL: test_vld3_u16 return vld3_u16(a); - // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 
8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>* +// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x3_t [[TMP6]] uint32x2x3_t test_vld3_u32(uint32_t const *a) { - // CHECK-LABEL: test_vld3_u32 return vld3_u32(a); - // CHECK: ld3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x3_t [[TMP6]] uint64x1x3_t test_vld3_u64(uint64_t const *a) { - // CHECK-LABEL: test_vld3_u64 return vld3_u64(a); - // CHECK: {{ld1|ld3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x3_t [[TMP5]] int8x8x3_t test_vld3_s8(int8_t const *a) { - // CHECK-LABEL: test_vld3_s8 return vld3_s8(a); - // CHECK: ld3 {{{ ?v[0-9]+.8b, 
v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x3_t [[TMP6]] int16x4x3_t test_vld3_s16(int16_t const *a) { - // CHECK-LABEL: test_vld3_s16 return vld3_s16(a); - // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>* +// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x3_t [[TMP6]] int32x2x3_t test_vld3_s32(int32_t const *a) { - // CHECK-LABEL: test_vld3_s32 return vld3_s32(a); - // CHECK: ld3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = 
load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x3_t [[TMP6]] int64x1x3_t test_vld3_s64(int64_t const *a) { - // CHECK-LABEL: test_vld3_s64 return vld3_s64(a); - // CHECK: {{ld1|ld3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x3_t [[TMP6]] float16x4x3_t test_vld3_f16(float16_t const *a) { - // CHECK-LABEL: test_vld3_f16 return vld3_f16(a); - // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>* +// CHECK: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0v2f32(<2 x float>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x3_t [[TMP6]] float32x2x3_t test_vld3_f32(float32_t const *a) { - // CHECK-LABEL: test_vld3_f32 return vld3_f32(a); - // CHECK: ld3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x double>* +// CHECK: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0v1f64(<1 x double>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store 
{ <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x3_t [[TMP6]] float64x1x3_t test_vld3_f64(float64_t const *a) { - // CHECK-LABEL: test_vld3_f64 return vld3_f64(a); - // CHECK: {{ld1|ld3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x3_t [[TMP5]] poly8x8x3_t test_vld3_p8(poly8_t const *a) { - // CHECK-LABEL: test_vld3_p8 return vld3_p8(a); - // CHECK: ld3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x3_t [[TMP6]] poly16x4x3_t test_vld3_p16(poly16_t const *a) { - // CHECK-LABEL: test_vld3_p16 return vld3_p16(a); - // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x 
i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x4_t [[TMP5]] uint8x16x4_t test_vld4q_u8(uint8_t const *a) { - // CHECK-LABEL: test_vld4q_u8 return vld4q_u8(a); - // CHECK: ld4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x4_t [[TMP6]] uint16x8x4_t test_vld4q_u16(uint16_t const *a) { - // CHECK-LABEL: test_vld4q_u16 return vld4q_u16(a); - // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>* +// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x4_t [[TMP6]] uint32x4x4_t test_vld4q_u32(uint32_t const *a) { - // CHECK-LABEL: test_vld4q_u32 return vld4q_u32(a); - // CHECK: ld4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, 
[{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>* +// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint64x2x4_t [[TMP6]] uint64x2x4_t test_vld4q_u64(uint64_t const *a) { - // CHECK-LABEL: test_vld4q_u64 return vld4q_u64(a); - // CHECK: ld4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x4_t [[TMP5]] int8x16x4_t test_vld4q_s8(int8_t const *a) { - // CHECK-LABEL: test_vld4q_s8 return vld4q_s8(a); - // CHECK: ld4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x4_t [[TMP6]] int16x8x4_t test_vld4q_s16(int16_t const *a) { - // CHECK-LABEL: test_vld4q_s16 return vld4q_s16(a); - // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>* +// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x4_t [[TMP6]] int32x4x4_t test_vld4q_s32(int32_t const *a) { - // CHECK-LABEL: test_vld4q_s32 return vld4q_s32(a); - // CHECK: ld4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>* +// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x4_t [[TMP6]] int64x2x4_t test_vld4q_s64(int64_t const *a) { - // CHECK-LABEL: test_vld4q_s64 return vld4q_s64(a); - // CHECK: ld4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } 
@llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x4_t [[TMP6]] float16x8x4_t test_vld4q_f16(float16_t const *a) { - // CHECK-LABEL: test_vld4q_f16 return vld4q_f16(a); - // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>* +// CHECK: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x4_t [[TMP6]] float32x4x4_t test_vld4q_f32(float32_t const *a) { - // CHECK-LABEL: test_vld4q_f32 return vld4q_f32(a); - // CHECK: ld4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x double>* +// CHECK: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0v2f64(<2 x double>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x4_t [[TMP6]] float64x2x4_t test_vld4q_f64(float64_t const *a) { - // CHECK-LABEL: 
test_vld4q_f64 return vld4q_f64(a); - // CHECK: ld4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x4_t [[TMP5]] poly8x16x4_t test_vld4q_p8(poly8_t const *a) { - // CHECK-LABEL: test_vld4q_p8 return vld4q_p8(a); - // CHECK: ld4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x4_t [[TMP6]] poly16x8x4_t test_vld4q_p16(poly16_t const *a) { - // CHECK-LABEL: test_vld4q_p16 return vld4q_p16(a); - // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x4_t* 
[[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x4_t [[TMP5]] uint8x8x4_t test_vld4_u8(uint8_t const *a) { - // CHECK-LABEL: test_vld4_u8 return vld4_u8(a); - // CHECK: ld4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x4_t [[TMP6]] uint16x4x4_t test_vld4_u16(uint16_t const *a) { - // CHECK-LABEL: test_vld4_u16 return vld4_u16(a); - // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>* +// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x4_t [[TMP6]] uint32x2x4_t test_vld4_u32(uint32_t const *a) { - // CHECK-LABEL: test_vld4_u32 return vld4_u32(a); - // CHECK: ld4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x 
i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x4_t [[TMP6]] uint64x1x4_t test_vld4_u64(uint64_t const *a) { - // CHECK-LABEL: test_vld4_u64 return vld4_u64(a); - // CHECK: {{ld1|ld4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x4_t [[TMP5]] int8x8x4_t test_vld4_s8(int8_t const *a) { - // CHECK-LABEL: test_vld4_s8 return vld4_s8(a); - // CHECK: ld4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x4_t [[TMP6]] int16x4x4_t test_vld4_s16(int16_t const *a) { - // CHECK-LABEL: test_vld4_s16 return vld4_s16(a); - // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca 
%struct.int32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>* +// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x4_t [[TMP6]] int32x2x4_t test_vld4_s32(int32_t const *a) { - // CHECK-LABEL: test_vld4_s32 return vld4_s32(a); - // CHECK: ld4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x4_t [[TMP6]] int64x1x4_t test_vld4_s64(int64_t const *a) { - // CHECK-LABEL: test_vld4_s64 return vld4_s64(a); - // CHECK: {{ld1|ld4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// 
CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x4_t [[TMP6]] float16x4x4_t test_vld4_f16(float16_t const *a) { - // CHECK-LABEL: test_vld4_f16 return vld4_f16(a); - // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>* +// CHECK: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0v2f32(<2 x float>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x4_t [[TMP6]] float32x2x4_t test_vld4_f32(float32_t const *a) { - // CHECK-LABEL: test_vld4_f32 return vld4_f32(a); - // CHECK: ld4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x double>* +// CHECK: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0v1f64(<1 x double>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x4_t [[TMP6]] float64x1x4_t test_vld4_f64(float64_t const *a) { - // CHECK-LABEL: test_vld4_f64 return vld4_f64(a); - // CHECK: {{ld1|ld4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } 
@llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x4_t [[TMP5]] poly8x8x4_t test_vld4_p8(poly8_t const *a) { - // CHECK-LABEL: test_vld4_p8 return vld4_p8(a); - // CHECK: ld4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x4_t [[TMP6]] poly16x4x4_t test_vld4_p16(poly16_t const *a) { - // CHECK-LABEL: test_vld4_p16 return vld4_p16(a); - // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: store <16 x i8> %b, <16 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1q_u8(uint8_t *a, uint8x16_t b) { - // CHECK-LABEL: test_vst1q_u8 vst1q_u8(a, b); - // CHECK: {{st1 { v[0-9]+.16b }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1q_u16(uint16_t *a, uint16x8_t b) { - // CHECK-LABEL: test_vst1q_u16 vst1q_u16(a, b); - // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1q_u32(uint32_t *a, uint32x4_t b) { - // CHECK-LABEL: test_vst1q_u32 
vst1q_u32(a, b); - // CHECK: {{st1 { v[0-9]+.4s }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1q_u64(uint64_t *a, uint64x2_t b) { - // CHECK-LABEL: test_vst1q_u64 vst1q_u64(a, b); - // CHECK: {{st1 { v[0-9]+.2d }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: store <16 x i8> %b, <16 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1q_s8(int8_t *a, int8x16_t b) { - // CHECK-LABEL: test_vst1q_s8 vst1q_s8(a, b); - // CHECK: {{st1 { v[0-9]+.16b }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1q_s16(int16_t *a, int16x8_t b) { - // CHECK-LABEL: test_vst1q_s16 vst1q_s16(a, b); - // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1q_s32(int32_t *a, int32x4_t b) { - // CHECK-LABEL: test_vst1q_s32 vst1q_s32(a, b); - // CHECK: {{st1 { v[0-9]+.4s }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1q_s64(int64_t *a, int64x2_t b) { - // CHECK-LABEL: test_vst1q_s64 vst1q_s64(a, b); - // CHECK: {{st1 { v[0-9]+.2d }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f16(half* %a, <8 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1q_f16(float16_t *a, float16x8_t b) { - // CHECK-LABEL: test_vst1q_f16 vst1q_f16(a, b); - // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f32(float* %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: store <4 x float> 
[[TMP3]], <4 x float>* [[TMP2]] +// CHECK: ret void void test_vst1q_f32(float32_t *a, float32x4_t b) { - // CHECK-LABEL: test_vst1q_f32 vst1q_f32(a, b); - // CHECK: {{st1 { v[0-9]+.4s }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f64(double* %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x double>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK: store <2 x double> [[TMP3]], <2 x double>* [[TMP2]] +// CHECK: ret void void test_vst1q_f64(float64_t *a, float64x2_t b) { - // CHECK-LABEL: test_vst1q_f64 vst1q_f64(a, b); - // CHECK: {{st1 { v[0-9]+.2d }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: store <16 x i8> %b, <16 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1q_p8(poly8_t *a, poly8x16_t b) { - // CHECK-LABEL: test_vst1q_p8 vst1q_p8(a, b); - // CHECK: {{st1 { v[0-9]+.16b }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1q_p16(poly16_t *a, poly16x8_t b) { - // CHECK-LABEL: test_vst1q_p16 vst1q_p16(a, b); - // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: store <8 x i8> %b, <8 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1_u8(uint8_t *a, uint8x8_t b) { - // CHECK-LABEL: test_vst1_u8 vst1_u8(a, b); - // CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_u16(uint16_t *a, uint16x4_t b) { - // CHECK-LABEL: test_vst1_u16 vst1_u16(a, b); - // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1_u32(uint32_t *a, uint32x2_t b) { - // CHECK-LABEL: test_vst1_u32 vst1_u32(a, b); - // CHECK: {{st1 { v[0-9]+.2s }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]] +// CHECK: ret void void 
test_vst1_u64(uint64_t *a, uint64x1_t b) { - // CHECK-LABEL: test_vst1_u64 vst1_u64(a, b); - // CHECK: {{st1 { v[0-9]+.1d }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: store <8 x i8> %b, <8 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1_s8(int8_t *a, int8x8_t b) { - // CHECK-LABEL: test_vst1_s8 vst1_s8(a, b); - // CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_s16(int16_t *a, int16x4_t b) { - // CHECK-LABEL: test_vst1_s16 vst1_s16(a, b); - // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1_s32(int32_t *a, int32x2_t b) { - // CHECK-LABEL: test_vst1_s32 vst1_s32(a, b); - // CHECK: {{st1 { v[0-9]+.2s }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1_s64(int64_t *a, int64x1_t b) { - // CHECK-LABEL: test_vst1_s64 vst1_s64(a, b); - // CHECK: {{st1 { v[0-9]+.1d }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f16(half* %a, <4 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_f16(float16_t *a, float16x4_t b) { - // CHECK-LABEL: test_vst1_f16 vst1_f16(a, b); - // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f32(float* %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: store <2 x float> [[TMP3]], <2 x float>* [[TMP2]] +// CHECK: ret void void test_vst1_f32(float32_t *a, float32x2_t b) { - // CHECK-LABEL: test_vst1_f32 vst1_f32(a, b); - // CHECK: {{st1 { v[0-9]+.2s }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f64(double* %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x double>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> 
[[TMP1]] to <1 x double> +// CHECK: store <1 x double> [[TMP3]], <1 x double>* [[TMP2]] +// CHECK: ret void void test_vst1_f64(float64_t *a, float64x1_t b) { - // CHECK-LABEL: test_vst1_f64 vst1_f64(a, b); - // CHECK: {{st1 { v[0-9]+.1d }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: store <8 x i8> %b, <8 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1_p8(poly8_t *a, poly8x8_t b) { - // CHECK-LABEL: test_vst1_p8 vst1_p8(a, b); - // CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_p16(poly16_t *a, poly16x4_t b) { - // CHECK-LABEL: test_vst1_p16 vst1_p16(a, b); - // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_u8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2q_u8(uint8_t *a, uint8x16x2_t b) { - // CHECK-LABEL: test_vst2q_u8 vst2q_u8(a, b); - // CHECK: st2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_u16(uint16_t *a, uint16x8x2_t b) { - // CHECK-LABEL: test_vst2q_u16 vst2q_u16(a, b); - // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_u32(uint32_t *a, uint32x4x2_t b) { - // CHECK-LABEL: test_vst2q_u32 vst2q_u32(a, b); - // CHECK: st2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* 
[[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_u64(uint64_t *a, uint64x2x2_t b) { - // CHECK-LABEL: test_vst2q_u64 vst2q_u64(a, b); - // CHECK: st2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2q_s8(int8_t *a, int8x16x2_t b) { - // CHECK-LABEL: test_vst2q_s8 vst2q_s8(a, b); - // CHECK: st2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = 
getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_s16(int16_t *a, int16x8x2_t b) { - // CHECK-LABEL: test_vst2q_s16 vst2q_s16(a, b); - // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_s32(int32_t *a, int32x4x2_t b) { - // CHECK-LABEL: test_vst2q_s32 vst2q_s32(a, b); - // CHECK: st2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] 
to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_s64(int64_t *a, int64x2x2_t b) { - // CHECK-LABEL: test_vst2q_s64 vst2q_s64(a, b); - // CHECK: st2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_f16(float16_t *a, float16x8x2_t b) { - // CHECK-LABEL: test_vst2q_f16 vst2q_f16(a, b); - // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], 
i32 0, i32 0 +// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: call void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_f32(float32_t *a, float32x4x2_t b) { - // CHECK-LABEL: test_vst2q_f32 vst2q_f32(a, b); - // CHECK: st2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> +// CHECK: call void @llvm.aarch64.neon.st2.v2f64.p0i8(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_f64(float64_t *a, float64x2x2_t b) { - // CHECK-LABEL: test_vst2q_f64 vst2q_f64(a, b); - // CHECK: st2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } 
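The st2 lowering pinned down by these vst2q checks stores its two source vectors element-interleaved, not back to back. A minimal standalone sketch of that behavior (assuming an AArch64 toolchain providing <arm_neon.h>; the buffer and constants below are illustrative only and are not taken from this test file):

  #include <arm_neon.h>
  #include <stdio.h>

  int main(void) {
    float32x4x2_t v;
    v.val[0] = vdupq_n_f32(1.0f);   /* lands in even elements 0, 2, 4, 6 */
    v.val[1] = vdupq_n_f32(2.0f);   /* lands in odd elements 1, 3, 5, 7 */
    float out[8];
    vst2q_f32(out, v);              /* one interleaving ST2 store */
    for (int i = 0; i < 8; ++i)
      printf("%.0f ", out[i]);      /* prints: 1 2 1 2 1 2 1 2 */
    putchar('\n');
    return 0;
  }

Note that the CHECK lines here only verify the IR-level call to @llvm.aarch64.neon.st2 and the struct-argument plumbing around it; the interleaved element order itself is a property of the ST2 instruction that intrinsic lowers to.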
+// CHECK-LABEL: define void @test_vst2q_p8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2q_p8(poly8_t *a, poly8x16x2_t b) { - // CHECK-LABEL: test_vst2q_p8 vst2q_p8(a, b); - // CHECK: st2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_p16(poly16_t *a, poly16x8x2_t b) { - // CHECK-LABEL: test_vst2q_p16 vst2q_p16(a, b); - // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca 
%struct.uint8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2_u8(uint8_t *a, uint8x8x2_t b) { - // CHECK-LABEL: test_vst2_u8 vst2_u8(a, b); - // CHECK: st2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_u16(uint16_t *a, uint16x4x2_t b) { - // CHECK-LABEL: test_vst2_u16 vst2_u16(a, b); - // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds 
%struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_u32(uint32_t *a, uint32x2x2_t b) { - // CHECK-LABEL: test_vst2_u32 vst2_u32(a, b); - // CHECK: st2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_u64(uint64_t *a, uint64x1x2_t b) { - // CHECK-LABEL: test_vst2_u64 vst2_u64(a, b); - // CHECK: {{st1|st2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) 
#0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2_s8(int8_t *a, int8x8x2_t b) { - // CHECK-LABEL: test_vst2_s8 vst2_s8(a, b); - // CHECK: st2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_s16(int16_t *a, int16x4x2_t b) { - // CHECK-LABEL: test_vst2_s16 vst2_s16(a, b); - // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds 
%struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_s32(int32_t *a, int32x2x2_t b) { - // CHECK-LABEL: test_vst2_s32 vst2_s32(a, b); - // CHECK: st2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_s64(int64_t *a, int64x1x2_t b) { - // CHECK-LABEL: test_vst2_s64 vst2_s64(a, b); - // CHECK: {{st1|st2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 { +// CHECK: 
[[B:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_f16(float16_t *a, float16x4x2_t b) { - // CHECK-LABEL: test_vst2_f16 vst2_f16(a, b); - // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: call void @llvm.aarch64.neon.st2.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void 
test_vst2_f32(float32_t *a, float32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_f32
   vst2_f32(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <1 x double>] %b.coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK: call void @llvm.aarch64.neon.st2.v1f64.p0i8(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2_f64(float64_t *a, float64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_f64
   vst2_f64(a, b);
-  // CHECK: {{st1|st2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst2_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK: ret void
 void test_vst2_p8(poly8_t *a, poly8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_p8
   vst2_p8(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst2_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <4 x i16>] %b.coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2_p16(poly16_t *a, poly16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_p16
   vst2_p16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_u8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
 void test_vst3q_u8(uint8_t *a, uint8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_u8
   vst3q_u8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_u16(uint16_t *a, uint16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_u16
   vst3q_u16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_u32(uint32_t *a, uint32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_u32
   vst3q_u32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_u64(uint64_t *a, uint64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_u64
   vst3q_u64(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
 void test_vst3q_s8(int8_t *a, int8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_s8
   vst3q_s8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_s16(int16_t *a, int16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_s16
   vst3q_s16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_s32(int32_t *a, int32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_s32
   vst3q_s32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_s64(int64_t *a, int64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_s64
   vst3q_s64(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x half>] %b.coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_f16(float16_t *a, float16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_f16
   vst3q_f16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x float>] %b.coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK: call void @llvm.aarch64.neon.st3.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_f32(float32_t *a, float32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_f32
   vst3q_f32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x double>] %b.coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK: call void @llvm.aarch64.neon.st3.v2f64.p0i8(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_f64(float64_t *a, float64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_f64
   vst3q_f64(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
 void test_vst3q_p8(poly8_t *a, poly8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_p8
   vst3q_p8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_p16(poly16_t *a, poly16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_p16
   vst3q_p16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
 void test_vst3_u8(uint8_t *a, uint8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_u8
   vst3_u8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_u16(uint16_t *a, uint16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_u16
   vst3_u16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_u32(uint32_t *a, uint32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_u32
   vst3_u32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_u64(uint64_t *a, uint64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_u64
   vst3_u64(a, b);
-  // CHECK: {{st1|st3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
 void test_vst3_s8(int8_t *a, int8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_s8
   vst3_s8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_s16(int16_t *a, int16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_s16
   vst3_s16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_s32(int32_t *a, int32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_s32
   vst3_s32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_s64(int64_t *a, int64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_s64
   vst3_s64(a, b);
-  // CHECK: {{st1|st3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x half>] %b.coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_f16(float16_t *a, float16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_f16
   vst3_f16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x float>] %b.coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK: call void @llvm.aarch64.neon.st3.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_f32(float32_t *a, float32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_f32
   vst3_f32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x double>] %b.coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK: call void @llvm.aarch64.neon.st3.v1f64.p0i8(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_f64(float64_t *a, float64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_f64
   vst3_f64(a, b);
-  // CHECK: {{st1|st3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
 void test_vst3_p8(poly8_t *a, poly8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_p8
   vst3_p8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_p16(poly16_t *a, poly16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_p16
   vst3_p16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
 void test_vst4q_u8(uint8_t *a, uint8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_u8
   vst4q_u8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_u16(uint16_t *a, uint16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_u16
   vst4q_u16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_u32(uint32_t *a, uint32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_u32
   vst4q_u32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_u64(uint64_t *a, uint64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_u64
   vst4q_u64(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
 void test_vst4q_s8(int8_t *a, int8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_s8
   vst4q_s8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_s16(int16_t *a, int16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_s16
   vst4q_s16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_s32(int32_t *a, int32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_s32
   vst4q_s32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_s64(int64_t *a, int64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_s64
   vst4q_s64(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
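The q-form vst4 tests above all share one shape: before optimization, the aggregate argument is copied with memcpy, each element vector is loaded and (for non-byte element types) round-tripped through <16 x i8> bitcasts, and the four vectors feed @llvm.aarch64.neon.st4. A minimal caller sketch, with hypothetical names (store_xyzw8 and the x/y/z/w channels) not taken from the test:

// Illustrative sketch, not part of the checked test.
void store_xyzw8(uint16_t *dst, uint16x8_t x, uint16x8_t y,
                 uint16x8_t z, uint16x8_t w) {
  uint16x8x4_t v = { { x, y, z, w } };  // four 8-lane channels
  vst4q_u16(dst, v);  // interleaved: x0,y0,z0,w0,x1,y1,z1,w1,...
}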
+// CHECK-LABEL: define void @test_vst4q_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_f16(float16_t *a, float16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_f16
   vst4q_f16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK: call void @llvm.aarch64.neon.st4.v4f32.p0i8(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_f32(float32_t *a, float32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_f32
   vst4q_f32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
+// CHECK: call void @llvm.aarch64.neon.st4.v2f64.p0i8(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_f64(float64_t *a, float64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_f64
   vst4q_f64(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
 void test_vst4q_p8(poly8_t *a, poly8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_p8
   vst4q_p8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4q_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_p16(poly16_t *a, poly16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_p16
   vst4q_p16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
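The tests that follow cover the 64-bit d-form variants: the same lowering shape, but the struct copy shrinks to 32 bytes with 8-byte alignment and the pieces are <8 x i8>-sized. A caller sketch, with a hypothetical name (store4_bytes) not taken from the test:

// Illustrative sketch, not part of the checked test.
void store4_bytes(uint8_t *dst, uint8x8x4_t four) {
  vst4_u8(dst, four);  // @llvm.aarch64.neon.st4.v8i8; 32 bytes written
}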
+// CHECK-LABEL: define void @test_vst4_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
 void test_vst4_u8(uint8_t *a, uint8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_u8
   vst4_u8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4_u16(uint16_t *a, uint16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_u16
   vst4_u16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4_u32(uint32_t *a, uint32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_u32
   vst4_u32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4_u64(uint64_t *a, uint64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_u64
   vst4_u64(a, b);
-  // CHECK: {{st1|st4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
 void test_vst4_s8(int8_t *a, int8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_s8
   vst4_s8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4_s16(int16_t *a, int16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_s16
   vst4_s16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4_s32(int32_t *a, int32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_s32
   vst4_s32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4_s64(int64_t *a, int64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_s64
   vst4_s64(a, b);
-  // CHECK: {{st1|st4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
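In the float16 tests below, the checks show that <4 x half> values are bitcast through <8 x i8> to <4 x i16> and stored with st4.v4i16, so the half case rides on the i16 intrinsic. A caller sketch with a hypothetical name (store4_half) not taken from the test:

// Illustrative sketch, not part of the checked test.
void store4_half(float16_t *dst, float16x4x4_t four) {
  vst4_f16(dst, four);  // checked to lower via <4 x i16> bitcasts
}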
%struct.float16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_f16(float16_t *a, float16x4x4_t b) { - // CHECK-LABEL: test_vst4_f16 vst4_f16(a, b); - // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: 
+// CHECK-LABEL: define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x float>] %b.coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK: call void @llvm.aarch64.neon.st4.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_f32(float32_t *a, float32x2x4_t b) {
- // CHECK-LABEL: test_vst4_f32
  vst4_f32(a, b);
- // CHECK: st4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <1 x double>] %b.coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
+// CHECK: call void @llvm.aarch64.neon.st4.v1f64.p0i8(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_f64(float64_t *a, float64x1x4_t b) {
- // CHECK-LABEL: test_vst4_f64
  vst4_f64(a, b);
- // CHECK: {{st1|st4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst4_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
void test_vst4_p8(poly8_t *a, poly8x8x4_t b) {
- // CHECK-LABEL: test_vst4_p8
  vst4_p8(a, b);
- // CHECK: st4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
}
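+// NOTE: the alloca/memcpy traffic checked in each vst4 function above is ABI
+// plumbing, not part of the store itself: the NEON struct argument (e.g.
+// poly8x8x4_t) arrives coerced to an array such as [4 x <8 x i8>] %b.coerce,
+// is spilled into a local, copied into the intrinsic's implicit __s1
+// temporary, and only then are the four element vectors loaded and passed to
+// @llvm.aarch64.neon.st4.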
+// CHECK-LABEL: define void @test_vst4_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_p16(poly16_t *a, poly16x4x4_t b) {
- // CHECK-LABEL: test_vst4_p16
  vst4_p16(a, b);
- // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld1q_u8_x2(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x2_t [[TMP4]]
uint8x16x2_t test_vld1q_u8_x2(uint8_t const *a) {
- // CHECK-LABEL: test_vld1q_u8_x2
  return vld1q_u8_x2(a);
- // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld1q_u16_x2(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x2_t [[TMP6]]
uint16x8x2_t test_vld1q_u16_x2(uint16_t const *a) {
- // CHECK-LABEL: test_vld1q_u16_x2
  return vld1q_u16_x2(a);
- // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld1q_u32_x2(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x2_t [[TMP6]]
uint32x4x2_t test_vld1q_u32_x2(uint32_t const *a) {
- // CHECK-LABEL: test_vld1q_u32_x2
  return vld1q_u32_x2(a);
- // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld1q_u64_x2(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
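+// NOTE: unlike vld4, the vld1*_x2 intrinsics exercised here perform a
+// contiguous, non-interleaving load of two whole registers (a two-register
+// ld1). A minimal usage sketch, assuming a 32-byte source buffer
+// (illustrative helper, not part of this test):
+//
+//   uint8x16x2_t load_pair_u8(const uint8_t buf[32]) {
+//     uint8x16x2_t r = vld1q_u8_x2(buf);  // r.val[0] = buf[0..15],
+//                                         // r.val[1] = buf[16..31]
+//     return r;
+//   }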
uint64x2x2_t test_vld1q_u64_x2(uint64_t const *a) { - // CHECK-LABEL: test_vld1q_u64_x2 return vld1q_u64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x2_t @test_vld1q_s8_x2(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP4]] int8x16x2_t test_vld1q_s8_x2(int8_t const *a) { - // CHECK-LABEL: test_vld1q_s8_x2 return vld1q_s8_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x2_t @test_vld1q_s16_x2(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP6]] int16x8x2_t test_vld1q_s16_x2(int16_t const *a) { - // CHECK-LABEL: test_vld1q_s16_x2 return vld1q_s16_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x2_t @test_vld1q_s32_x2(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP6]] 
int32x4x2_t test_vld1q_s32_x2(int32_t const *a) { - // CHECK-LABEL: test_vld1q_s32_x2 return vld1q_s32_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x2_t @test_vld1q_s64_x2(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x2_t [[TMP6]] int64x2x2_t test_vld1q_s64_x2(int64_t const *a) { - // CHECK-LABEL: test_vld1q_s64_x2 return vld1q_s64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x2_t @test_vld1q_f16_x2(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x2_t [[TMP6]] float16x8x2_t test_vld1q_f16_x2(float16_t const *a) { - // CHECK-LABEL: test_vld1q_f16_x2 return vld1q_f16_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x2_t @test_vld1q_f32_x2(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* 
[[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP6]] float32x4x2_t test_vld1q_f32_x2(float32_t const *a) { - // CHECK-LABEL: test_vld1q_f32_x2 return vld1q_f32_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x2_t @test_vld1q_f64_x2(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x2_t [[TMP6]] float64x2x2_t test_vld1q_f64_x2(float64_t const *a) { - // CHECK-LABEL: test_vld1q_f64_x2 return vld1q_f64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld1q_p8_x2(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x2_t [[TMP4]] poly8x16x2_t test_vld1q_p8_x2(poly8_t const *a) { - // CHECK-LABEL: test_vld1q_p8_x2 return vld1q_p8_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld1q_p16_x2(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* 
[[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP6]] poly16x8x2_t test_vld1q_p16_x2(poly16_t const *a) { - // CHECK-LABEL: test_vld1q_p16_x2 return vld1q_p16_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld1q_p64_x2(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x2_t [[TMP6]] poly64x2x2_t test_vld1q_p64_x2(poly64_t const *a) { - // CHECK-LABEL: test_vld1q_p64_x2 return vld1q_p64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld1_u8_x2(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x2_t [[TMP4]] uint8x8x2_t test_vld1_u8_x2(uint8_t const *a) { - // CHECK-LABEL: test_vld1_u8_x2 return vld1_u8_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld1_u16_x2(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* 
[[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP6]] uint16x4x2_t test_vld1_u16_x2(uint16_t const *a) { - // CHECK-LABEL: test_vld1_u16_x2 return vld1_u16_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld1_u32_x2(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP6]] uint32x2x2_t test_vld1_u32_x2(uint32_t const *a) { - // CHECK-LABEL: test_vld1_u32_x2 return vld1_u32_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld1_u64_x2(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x2_t [[TMP6]] uint64x1x2_t test_vld1_u64_x2(uint64_t const *a) { - // CHECK-LABEL: test_vld1_u64_x2 return vld1_u64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x2_t @test_vld1_s8_x2(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8* 
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x2_t [[TMP4]] int8x8x2_t test_vld1_s8_x2(int8_t const *a) { - // CHECK-LABEL: test_vld1_s8_x2 return vld1_s8_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x2_t @test_vld1_s16_x2(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP6]] int16x4x2_t test_vld1_s16_x2(int16_t const *a) { - // CHECK-LABEL: test_vld1_s16_x2 return vld1_s16_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x2_t @test_vld1_s32_x2(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP6]] int32x2x2_t test_vld1_s32_x2(int32_t const *a) { - // CHECK-LABEL: test_vld1_s32_x2 return vld1_s32_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x2_t @test_vld1_s64_x2(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to 
i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x2_t [[TMP6]] int64x1x2_t test_vld1_s64_x2(int64_t const *a) { - // CHECK-LABEL: test_vld1_s64_x2 return vld1_s64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x2_t @test_vld1_f16_x2(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x2_t [[TMP6]] float16x4x2_t test_vld1_f16_x2(float16_t const *a) { - // CHECK-LABEL: test_vld1_f16_x2 return vld1_f16_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x2_t @test_vld1_f32_x2(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP6]] float32x2x2_t test_vld1_f32_x2(float32_t const *a) { - // CHECK-LABEL: test_vld1_f32_x2 return vld1_f32_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x2_t @test_vld1_f64_x2(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { 
<1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x2_t [[TMP6]] float64x1x2_t test_vld1_f64_x2(float64_t const *a) { - // CHECK-LABEL: test_vld1_f64_x2 return vld1_f64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld1_p8_x2(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x2_t [[TMP4]] poly8x8x2_t test_vld1_p8_x2(poly8_t const *a) { - // CHECK-LABEL: test_vld1_p8_x2 return vld1_p8_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld1_p16_x2(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP6]] poly16x4x2_t test_vld1_p16_x2(poly16_t const *a) { - // CHECK-LABEL: test_vld1_p16_x2 return vld1_p16_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld1_p64_x2(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] 
to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x2_t [[TMP6]] poly64x1x2_t test_vld1_p64_x2(poly64_t const *a) { - // CHECK-LABEL: test_vld1_p64_x2 return vld1_p64_x2(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld1q_u8_x3(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x3_t [[TMP4]] uint8x16x3_t test_vld1q_u8_x3(uint8_t const *a) { - // CHECK-LABEL: test_vld1q_u8_x3 return vld1q_u8_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld1q_u16_x3(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x3_t [[TMP6]] uint16x8x3_t test_vld1q_u16_x3(uint16_t const *a) { - // CHECK-LABEL: test_vld1q_u16_x3 return vld1q_u16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld1q_u32_x3(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: 
[[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x3_t [[TMP6]] uint32x4x3_t test_vld1q_u32_x3(uint32_t const *a) { - // CHECK-LABEL: test_vld1q_u32_x3 return vld1q_u32_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld1q_u64_x3(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint64x2x3_t [[TMP6]] uint64x2x3_t test_vld1q_u64_x3(uint64_t const *a) { - // CHECK-LABEL: test_vld1q_u64_x3 return vld1q_u64_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x3_t @test_vld1q_s8_x3(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x3_t [[TMP4]] int8x16x3_t test_vld1q_s8_x3(int8_t const *a) { - // CHECK-LABEL: test_vld1q_s8_x3 return vld1q_s8_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x3_t @test_vld1q_s16_x3(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16 +// 
CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x3_t [[TMP6]] int16x8x3_t test_vld1q_s16_x3(int16_t const *a) { - // CHECK-LABEL: test_vld1q_s16_x3 return vld1q_s16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x3_t @test_vld1q_s32_x3(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x3_t [[TMP6]] int32x4x3_t test_vld1q_s32_x3(int32_t const *a) { - // CHECK-LABEL: test_vld1q_s32_x3 return vld1q_s32_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x3_t @test_vld1q_s64_x3(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x3_t [[TMP6]] int64x2x3_t test_vld1q_s64_x3(int64_t const *a) { - // CHECK-LABEL: test_vld1q_s64_x3 return vld1q_s64_x3(a); - // CHECK: 
ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x3_t @test_vld1q_f16_x3(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x3_t [[TMP6]] float16x8x3_t test_vld1q_f16_x3(float16_t const *a) { - // CHECK-LABEL: test_vld1q_f16_x3 return vld1q_f16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x3_t @test_vld1q_f32_x3(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x3_t [[TMP6]] float32x4x3_t test_vld1q_f32_x3(float32_t const *a) { - // CHECK-LABEL: test_vld1q_f32_x3 return vld1q_f32_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x3_t @test_vld1q_f64_x3(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8* +// CHECK: 
[[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x3_t [[TMP6]] float64x2x3_t test_vld1q_f64_x3(float64_t const *a) { - // CHECK-LABEL: test_vld1q_f64_x3 return vld1q_f64_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld1q_p8_x3(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x3_t [[TMP4]] poly8x16x3_t test_vld1q_p8_x3(poly8_t const *a) { - // CHECK-LABEL: test_vld1q_p8_x3 return vld1q_p8_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld1q_p16_x3(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x3_t [[TMP6]] poly16x8x3_t test_vld1q_p16_x3(poly16_t const *a) { - // CHECK-LABEL: test_vld1q_p16_x3 return vld1q_p16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld1q_p64_x3(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x 
i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x3_t [[TMP6]] poly64x2x3_t test_vld1q_p64_x3(poly64_t const *a) { - // CHECK-LABEL: test_vld1q_p64_x3 return vld1q_p64_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld1_u8_x3(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x3_t [[TMP4]] uint8x8x3_t test_vld1_u8_x3(uint8_t const *a) { - // CHECK-LABEL: test_vld1_u8_x3 return vld1_u8_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld1_u16_x3(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x3_t [[TMP6]] uint16x4x3_t test_vld1_u16_x3(uint16_t const *a) { - // CHECK-LABEL: test_vld1_u16_x3 return vld1_u16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld1_u32_x3(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x 
i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x3_t [[TMP6]] uint32x2x3_t test_vld1_u32_x3(uint32_t const *a) { - // CHECK-LABEL: test_vld1_u32_x3 return vld1_u32_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld1_u64_x3(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x3_t [[TMP6]] uint64x1x3_t test_vld1_u64_x3(uint64_t const *a) { - // CHECK-LABEL: test_vld1_u64_x3 return vld1_u64_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x3_t @test_vld1_s8_x3(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x3_t [[TMP4]] int8x8x3_t test_vld1_s8_x3(int8_t const *a) { - // CHECK-LABEL: test_vld1_s8_x3 return vld1_s8_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x3_t @test_vld1_s16_x3(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = 
bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x3_t [[TMP6]] int16x4x3_t test_vld1_s16_x3(int16_t const *a) { - // CHECK-LABEL: test_vld1_s16_x3 return vld1_s16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x3_t @test_vld1_s32_x3(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x3_t [[TMP6]] int32x2x3_t test_vld1_s32_x3(int32_t const *a) { - // CHECK-LABEL: test_vld1_s32_x3 return vld1_s32_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x3_t @test_vld1_s64_x3(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x3_t [[TMP6]] int64x1x3_t test_vld1_s64_x3(int64_t const *a) { - // CHECK-LABEL: test_vld1_s64_x3 return vld1_s64_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x3_t 
@test_vld1_f16_x3(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x3_t [[TMP6]] float16x4x3_t test_vld1_f16_x3(float16_t const *a) { - // CHECK-LABEL: test_vld1_f16_x3 return vld1_f16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x3_t @test_vld1_f32_x3(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x3_t [[TMP6]] float32x2x3_t test_vld1_f32_x3(float32_t const *a) { - // CHECK-LABEL: test_vld1_f32_x3 return vld1_f32_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x3_t @test_vld1_f64_x3(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 
24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x3_t [[TMP6]] float64x1x3_t test_vld1_f64_x3(float64_t const *a) { - // CHECK-LABEL: test_vld1_f64_x3 return vld1_f64_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld1_p8_x3(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x3_t [[TMP4]] poly8x8x3_t test_vld1_p8_x3(poly8_t const *a) { - // CHECK-LABEL: test_vld1_p8_x3 return vld1_p8_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld1_p16_x3(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x3_t [[TMP6]] poly16x4x3_t test_vld1_p16_x3(poly16_t const *a) { - // CHECK-LABEL: test_vld1_p16_x3 return vld1_p16_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld1_p64_x3(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8* +// 
CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x3_t [[TMP6]] poly64x1x3_t test_vld1_p64_x3(poly64_t const *a) { - // CHECK-LABEL: test_vld1_p64_x3 return vld1_p64_x3(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld1q_u8_x4(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x4_t [[TMP4]] uint8x16x4_t test_vld1q_u8_x4(uint8_t const *a) { - // CHECK-LABEL: test_vld1q_u8_x4 return vld1q_u8_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld1q_u16_x4(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x4_t [[TMP6]] uint16x8x4_t test_vld1q_u16_x4(uint16_t const *a) { - // CHECK-LABEL: test_vld1q_u16_x4 return vld1q_u16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld1q_u32_x4(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } 
@llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x4_t [[TMP6]] uint32x4x4_t test_vld1q_u32_x4(uint32_t const *a) { - // CHECK-LABEL: test_vld1q_u32_x4 return vld1q_u32_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld1q_u64_x4(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint64x2x4_t [[TMP6]] uint64x2x4_t test_vld1q_u64_x4(uint64_t const *a) { - // CHECK-LABEL: test_vld1q_u64_x4 return vld1q_u64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x4_t @test_vld1q_s8_x4(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x4_t [[TMP4]] int8x16x4_t test_vld1q_s8_x4(int8_t const *a) { - // CHECK-LABEL: test_vld1q_s8_x4 return vld1q_s8_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x4_t @test_vld1q_s16_x4(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca 
%struct.int16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x4_t [[TMP6]] int16x8x4_t test_vld1q_s16_x4(int16_t const *a) { - // CHECK-LABEL: test_vld1q_s16_x4 return vld1q_s16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x4_t @test_vld1q_s32_x4(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x4_t [[TMP6]] int32x4x4_t test_vld1q_s32_x4(int32_t const *a) { - // CHECK-LABEL: test_vld1q_s32_x4 return vld1q_s32_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x4_t @test_vld1q_s64_x4(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// 
CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x4_t [[TMP6]] int64x2x4_t test_vld1q_s64_x4(int64_t const *a) { - // CHECK-LABEL: test_vld1q_s64_x4 return vld1q_s64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x4_t @test_vld1q_f16_x4(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x4_t [[TMP6]] float16x8x4_t test_vld1q_f16_x4(float16_t const *a) { - // CHECK-LABEL: test_vld1q_f16_x4 return vld1q_f16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x4_t @test_vld1q_f32_x4(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x4_t [[TMP6]] float32x4x4_t test_vld1q_f32_x4(float32_t const *a) { - // CHECK-LABEL: test_vld1q_f32_x4 return vld1q_f32_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x4_t @test_vld1q_f64_x4(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 
x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x4_t [[TMP6]] float64x2x4_t test_vld1q_f64_x4(float64_t const *a) { - // CHECK-LABEL: test_vld1q_f64_x4 return vld1q_f64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld1q_p8_x4(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x4_t [[TMP4]] poly8x16x4_t test_vld1q_p8_x4(poly8_t const *a) { - // CHECK-LABEL: test_vld1q_p8_x4 return vld1q_p8_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld1q_p16_x4(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x4_t [[TMP6]] poly16x8x4_t test_vld1q_p16_x4(poly16_t const *a) { - // CHECK-LABEL: test_vld1q_p16_x4 return vld1q_p16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define 
%struct.poly64x2x4_t @test_vld1q_p64_x4(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x4_t [[TMP6]] poly64x2x4_t test_vld1q_p64_x4(poly64_t const *a) { - // CHECK-LABEL: test_vld1q_p64_x4 return vld1q_p64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld1_u8_x4(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x4_t [[TMP4]] uint8x8x4_t test_vld1_u8_x4(uint8_t const *a) { - // CHECK-LABEL: test_vld1_u8_x4 return vld1_u8_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld1_u16_x4(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load 
%struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x4_t [[TMP6]] uint16x4x4_t test_vld1_u16_x4(uint16_t const *a) { - // CHECK-LABEL: test_vld1_u16_x4 return vld1_u16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld1_u32_x4(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x4_t [[TMP6]] uint32x2x4_t test_vld1_u32_x4(uint32_t const *a) { - // CHECK-LABEL: test_vld1_u32_x4 return vld1_u32_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld1_u64_x4(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x4_t [[TMP6]] uint64x1x4_t test_vld1_u64_x4(uint64_t const *a) { - // CHECK-LABEL: test_vld1_u64_x4 return vld1_u64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x4_t @test_vld1_s8_x4(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x 
i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x4_t [[TMP4]] int8x8x4_t test_vld1_s8_x4(int8_t const *a) { - // CHECK-LABEL: test_vld1_s8_x4 return vld1_s8_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x4_t @test_vld1_s16_x4(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x4_t [[TMP6]] int16x4x4_t test_vld1_s16_x4(int16_t const *a) { - // CHECK-LABEL: test_vld1_s16_x4 return vld1_s16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x4_t @test_vld1_s32_x4(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x4_t [[TMP6]] int32x2x4_t test_vld1_s32_x4(int32_t const *a) { - // CHECK-LABEL: test_vld1_s32_x4 return vld1_s32_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x4_t @test_vld1_s64_x4(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: 
[[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x4_t [[TMP6]] int64x1x4_t test_vld1_s64_x4(int64_t const *a) { - // CHECK-LABEL: test_vld1_s64_x4 return vld1_s64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x4_t @test_vld1_f16_x4(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x4_t [[TMP6]] float16x4x4_t test_vld1_f16_x4(float16_t const *a) { - // CHECK-LABEL: test_vld1_f16_x4 return vld1_f16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x4_t @test_vld1_f32_x4(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret 
%struct.float32x2x4_t [[TMP6]] float32x2x4_t test_vld1_f32_x4(float32_t const *a) { - // CHECK-LABEL: test_vld1_f32_x4 return vld1_f32_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x4_t @test_vld1_f64_x4(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x4_t [[TMP6]] float64x1x4_t test_vld1_f64_x4(float64_t const *a) { - // CHECK-LABEL: test_vld1_f64_x4 return vld1_f64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld1_p8_x4(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x4_t [[TMP4]] poly8x8x4_t test_vld1_p8_x4(poly8_t const *a) { - // CHECK-LABEL: test_vld1_p8_x4 return vld1_p8_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld1_p16_x4(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x 
i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x4_t [[TMP6]] poly16x4x4_t test_vld1_p16_x4(poly16_t const *a) { - // CHECK-LABEL: test_vld1_p16_x4 return vld1_p16_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld1_p64_x4(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x4_t [[TMP6]] poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) { - // CHECK-LABEL: test_vld1_p64_x4 return vld1_p64_x4(a); - // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u8_x2(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] %b.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst1q_u8_x2(uint8_t *a, uint8x16x2_t b) { - // CHECK-LABEL: test_vst1q_u8_x2 vst1q_u8_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void 
@test_vst1q_u16_x2(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1q_u16_x2(uint16_t *a, uint16x8x2_t b) { - // CHECK-LABEL: test_vst1q_u16_x2 vst1q_u16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u32_x2(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] %b.coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] 
to i32* +// CHECK: call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]]) +// CHECK: ret void void test_vst1q_u32_x2(uint32_t *a, uint32x4x2_t b) { - // CHECK-LABEL: test_vst1q_u32_x2 vst1q_u32_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u64_x2(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]]) +// CHECK: ret void void test_vst1q_u64_x2(uint64_t *a, uint64x2x2_t b) { - // CHECK-LABEL: test_vst1q_u64_x2 vst1q_u64_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s8_x2(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void 
@llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst1q_s8_x2(int8_t *a, int8x16x2_t b) { - // CHECK-LABEL: test_vst1q_s8_x2 vst1q_s8_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s16_x2(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1q_s16_x2(int16_t *a, int16x8x2_t b) { - // CHECK-LABEL: test_vst1q_s16_x2 vst1q_s16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s32_x2(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x 
i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]]) +// CHECK: ret void void test_vst1q_s32_x2(int32_t *a, int32x4x2_t b) { - // CHECK-LABEL: test_vst1q_s32_x2 vst1q_s32_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s64_x2(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]]) +// CHECK: ret void void test_vst1q_s64_x2(int64_t *a, int64x2x2_t b) { - // CHECK-LABEL: test_vst1q_s64_x2 vst1q_s64_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f16_x2(half* %a, [2 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* 
[[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1q_f16_x2(float16_t *a, float16x8x2_t b) { - // CHECK-LABEL: test_vst1q_f16_x2 vst1q_f16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f32_x2(float* %a, [2 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float* +// CHECK: call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], float* [[TMP9]]) +// CHECK: ret void void test_vst1q_f32_x2(float32_t *a, float32x4x2_t b) { - // CHECK-LABEL: test_vst1q_f32_x2 vst1q_f32_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f64_x2(double* %a, [2 x <2 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to 
i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to double* +// CHECK: call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], double* [[TMP9]]) +// CHECK: ret void void test_vst1q_f64_x2(float64_t *a, float64x2x2_t b) { - // CHECK-LABEL: test_vst1q_f64_x2 vst1q_f64_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p8_x2(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst1q_p8_x2(poly8_t *a, poly8x16x2_t b) { - // CHECK-LABEL: test_vst1q_p8_x2 vst1q_p8_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p16_x2(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast 
%struct.poly16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1q_p16_x2(poly16_t *a, poly16x8x2_t b) { - // CHECK-LABEL: test_vst1q_p16_x2 vst1q_p16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p64_x2(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]]) +// CHECK: ret void void test_vst1q_p64_x2(poly64_t *a, poly64x2x2_t b) { - // CHECK-LABEL: test_vst1q_p64_x2 vst1q_p64_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u8_x2(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca 
%struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst1_u8_x2(uint8_t *a, uint8x8x2_t b) { - // CHECK-LABEL: test_vst1_u8_x2 vst1_u8_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u16_x2(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1_u16_x2(uint16_t *a, uint16x4x2_t b) { - // CHECK-LABEL: test_vst1_u16_x2 vst1_u16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u32_x2(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr 
inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]]) +// CHECK: ret void void test_vst1_u32_x2(uint32_t *a, uint32x2x2_t b) { - // CHECK-LABEL: test_vst1_u32_x2 vst1_u32_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]]) +// CHECK: ret void void test_vst1_u64_x2(uint64_t *a, uint64x1x2_t b) { - // CHECK-LABEL: test_vst1_u64_x2 vst1_u64_x2(a, b); - // 
CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s8_x2(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst1_s8_x2(int8_t *a, int8x8x2_t b) { - // CHECK-LABEL: test_vst1_s8_x2 vst1_s8_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s16_x2(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1_s16_x2(int16_t *a, int16x4x2_t b) { - // CHECK-LABEL: test_vst1_s16_x2 vst1_s16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void 
@test_vst1_s32_x2(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]]) +// CHECK: ret void void test_vst1_s32_x2(int32_t *a, int32x2x2_t b) { - // CHECK-LABEL: test_vst1_s32_x2 vst1_s32_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void 
@llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]]) +// CHECK: ret void void test_vst1_s64_x2(int64_t *a, int64x1x2_t b) { - // CHECK-LABEL: test_vst1_s64_x2 vst1_s64_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f16_x2(half* %a, [2 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1_f16_x2(float16_t *a, float16x4x2_t b) { - // CHECK-LABEL: test_vst1_f16_x2 vst1_f16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f32_x2(float* %a, [2 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x 
float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float* +// CHECK: call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], float* [[TMP9]]) +// CHECK: ret void void test_vst1_f32_x2(float32_t *a, float32x2x2_t b) { - // CHECK-LABEL: test_vst1_f32_x2 vst1_f32_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f64_x2(double* %a, [2 x <1 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to double* +// CHECK: call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> [[TMP7]], <1 x double> [[TMP8]], double* [[TMP9]]) +// CHECK: ret void void test_vst1_f64_x2(float64_t *a, float64x1x2_t b) { - // CHECK-LABEL: test_vst1_f64_x2 vst1_f64_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p8_x2(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* 
[[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst1_p8_x2(poly8_t *a, poly8x8x2_t b) { - // CHECK-LABEL: test_vst1_p8_x2 vst1_p8_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p16_x2(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]]) +// CHECK: ret void void test_vst1_p16_x2(poly16_t *a, poly16x4x2_t b) { - // CHECK-LABEL: test_vst1_p16_x2 vst1_p16_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 
0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]]) +// CHECK: ret void void test_vst1_p64_x2(poly64_t *a, poly64x1x2_t b) { - // CHECK-LABEL: test_vst1_p64_x2 vst1_p64_x2(a, b); - // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u8_x3(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst1q_u8_x3(uint8_t *a, uint8x16x3_t b) { - // CHECK-LABEL: test_vst1q_u8_x3 vst1q_u8_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u16_x3(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]],
i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]]) +// CHECK: ret void void test_vst1q_u16_x3(uint16_t *a, uint16x8x3_t b) { - // CHECK-LABEL: test_vst1q_u16_x3 vst1q_u16_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u32_x3(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16 +// CHECK:
[[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]]) +// CHECK: ret void void test_vst1q_u32_x3(uint32_t *a, uint32x4x3_t b) { - // CHECK-LABEL: test_vst1q_u32_x3 vst1q_u32_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_u64_x3(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]]) +// CHECK: ret void void test_vst1q_u64_x3(uint64_t *a, uint64x2x3_t b) { - // CHECK-LABEL: test_vst1q_u64_x3 vst1q_u64_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s8_x3(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast 
%struct.int8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst1q_s8_x3(int8_t *a, int8x16x3_t b) { - // CHECK-LABEL: test_vst1q_s8_x3 vst1q_s8_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s16_x3(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void 
@llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i16* [[TMP1]]2) +// CHECK: ret void void test_vst1q_s16_x3(int16_t *a, int16x8x3_t b) { - // CHECK-LABEL: test_vst1q_s16_x3 vst1q_s16_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s32_x3(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, i32* [[TMP1]]2) +// CHECK: ret void void test_vst1q_s32_x3(int32_t *a, int32x4x3_t b) { - // CHECK-LABEL: test_vst1q_s32_x3 vst1q_s32_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s64_x3(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, 
i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP1]]0, <2 x i64> [[TMP1]]1, i64* [[TMP1]]2) +// CHECK: ret void void test_vst1q_s64_x3(int64_t *a, int64x2x3_t b) { - // CHECK-LABEL: test_vst1q_s64_x3 vst1q_s64_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f16_x3(half* %a, [3 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x 
i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i16* [[TMP1]]2) +// CHECK: ret void void test_vst1q_f16_x3(float16_t *a, float16x8x3_t b) { - // CHECK-LABEL: test_vst1q_f16_x3 vst1q_f16_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f32_x3(float* %a, [3 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float* +// CHECK: call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> [[TMP9]], <4 x float> [[TMP1]]0, <4 x float> [[TMP1]]1, float* [[TMP1]]2) +// CHECK: ret void void test_vst1q_f32_x3(float32_t *a, float32x4x3_t b) { - // CHECK-LABEL: test_vst1q_f32_x3 vst1q_f32_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f64_x3(double* %a, [3 x <2 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast 
%struct.float64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to double* +// CHECK: call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> [[TMP9]], <2 x double> [[TMP1]]0, <2 x double> [[TMP1]]1, double* [[TMP1]]2) +// CHECK: ret void void test_vst1q_f64_x3(float64_t *a, float64x2x3_t b) { - // CHECK-LABEL: test_vst1q_f64_x3 vst1q_f64_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p8_x3(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// 
CHECK: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst1q_p8_x3(poly8_t *a, poly8x16x3_t b) { - // CHECK-LABEL: test_vst1q_p8_x3 vst1q_p8_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p16_x3(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i16* [[TMP1]]2) +// CHECK: ret void void test_vst1q_p16_x3(poly16_t *a, poly16x8x3_t b) { - // CHECK-LABEL: test_vst1q_p16_x3 vst1q_p16_x3(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p64_x3(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, 
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK: ret void
void test_vst1q_p64_x3(poly64_t *a, poly64x2x3_t b) {
- // CHECK-LABEL: test_vst1q_p64_x3
  vst1q_p64_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_u8_x3(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
void test_vst1_u8_x3(uint8_t *a, uint8x8x3_t b) {
- // CHECK-LABEL: test_vst1_u8_x3
  vst1_u8_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_u16_x3(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK: ret void
void test_vst1_u16_x3(uint16_t *a, uint16x4x3_t b) {
- // CHECK-LABEL: test_vst1_u16_x3
  vst1_u16_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_u32_x3(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK: ret void
void test_vst1_u32_x3(uint32_t *a, uint32x2x3_t b) {
- // CHECK-LABEL: test_vst1_u32_x3
  vst1_u32_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_u64_x3(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK: ret void
void test_vst1_u64_x3(uint64_t *a, uint64x1x3_t b) {
- // CHECK-LABEL: test_vst1_u64_x3
  vst1_u64_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_s8_x3(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
void test_vst1_s8_x3(int8_t *a, int8x8x3_t b) {
- // CHECK-LABEL: test_vst1_s8_x3
  vst1_s8_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_s16_x3(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK: ret void
void test_vst1_s16_x3(int16_t *a, int16x4x3_t b) {
- // CHECK-LABEL: test_vst1_s16_x3
  vst1_s16_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_s32_x3(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK: ret void
void test_vst1_s32_x3(int32_t *a, int32x2x3_t b) {
- // CHECK-LABEL: test_vst1_s32_x3
  vst1_s32_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_s64_x3(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK: ret void
void test_vst1_s64_x3(int64_t *a, int64x1x3_t b) {
- // CHECK-LABEL: test_vst1_s64_x3
  vst1_s64_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_f16_x3(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK: ret void
void test_vst1_f16_x3(float16_t *a, float16x4x3_t b) {
- // CHECK-LABEL: test_vst1_f16_x3
  vst1_f16_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_f32_x3(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], float* [[TMP12]])
+// CHECK: ret void
void test_vst1_f32_x3(float32_t *a, float32x2x3_t b) {
- // CHECK-LABEL: test_vst1_f32_x3
  vst1_f32_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_f64_x3(double* %a, [3 x <1 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to double*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], double* [[TMP12]])
+// CHECK: ret void
void test_vst1_f64_x3(float64_t *a, float64x1x3_t b) {
- // CHECK-LABEL: test_vst1_f64_x3
  vst1_f64_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_p8_x3(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
void test_vst1_p8_x3(poly8_t *a, poly8x8x3_t b) {
- // CHECK-LABEL: test_vst1_p8_x3
  vst1_p8_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_p16_x3(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK: ret void
void test_vst1_p16_x3(poly16_t *a, poly16x4x3_t b) {
- // CHECK-LABEL: test_vst1_p16_x3
  vst1_p16_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1_p64_x3(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK: call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK: ret void
void test_vst1_p64_x3(poly64_t *a, poly64x1x3_t b) {
- // CHECK-LABEL: test_vst1_p64_x3
  vst1_p64_x3(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1q_u8_x4(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
void test_vst1q_u8_x4(uint8_t *a, uint8x16x4_t b) {
- // CHECK-LABEL: test_vst1q_u8_x4
  vst1q_u8_x4(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1q_u16_x4(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK: ret void
void test_vst1q_u16_x4(uint16_t *a, uint16x8x4_t b) {
- // CHECK-LABEL: test_vst1q_u16_x4
  vst1q_u16_x4(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1q_u32_x4(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK: call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
+// CHECK: ret void
void test_vst1q_u32_x4(uint32_t *a, uint32x4x4_t b) {
- // CHECK-LABEL: test_vst1q_u32_x4
  vst1q_u32_x4(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1q_u64_x4(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK: call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK: ret void
void test_vst1q_u64_x4(uint64_t *a, uint64x2x4_t b) {
- // CHECK-LABEL: test_vst1q_u64_x4
  vst1q_u64_x4(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1q_s8_x4(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
void test_vst1q_s8_x4(int8_t *a, int8x16x4_t b) {
- // CHECK-LABEL: test_vst1q_s8_x4
  vst1q_s8_x4(a, b);
- // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
}
+// CHECK-LABEL: define void @test_vst1q_s16_x4(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16*
[[TMP1]]5) +// CHECK: ret void void test_vst1q_s16_x4(int16_t *a, int16x8x4_t b) { - // CHECK-LABEL: test_vst1q_s16_x4 vst1q_s16_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s32_x4(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x i32> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, <4 x i32> [[TMP1]]3, <4 x i32> [[TMP1]]4, i32* [[TMP1]]5) +// CHECK: ret void void test_vst1q_s32_x4(int32_t *a, int32x4x4_t b) { - // CHECK-LABEL: test_vst1q_s32_x4 vst1q_s32_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_s64_x4(i64* %a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: 
[[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <2 x i64> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP1]]1, <2 x i64> [[TMP1]]2, <2 x i64> [[TMP1]]3, <2 x i64> [[TMP1]]4, i64* [[TMP1]]5) +// CHECK: ret void void test_vst1q_s64_x4(int64_t *a, int64x2x4_t b) { - // CHECK-LABEL: test_vst1q_s64_x4 vst1q_s64_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f16_x4(half* %a, [4 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> 
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i16* [[TMP1]]5) +// CHECK: ret void void test_vst1q_f16_x4(float16_t *a, float16x8x4_t b) { - // CHECK-LABEL: test_vst1q_f16_x4 vst1q_f16_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f32_x4(float* %a, [4 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]3, 
i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x float> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float* +// CHECK: call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> [[TMP1]]1, <4 x float> [[TMP1]]2, <4 x float> [[TMP1]]3, <4 x float> [[TMP1]]4, float* [[TMP1]]5) +// CHECK: ret void void test_vst1q_f32_x4(float32_t *a, float32x4x4_t b) { - // CHECK-LABEL: test_vst1q_f32_x4 vst1q_f32_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_f64_x4(double* %a, [4 x <2 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8> 
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <2 x double> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to double* +// CHECK: call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> [[TMP1]]1, <2 x double> [[TMP1]]2, <2 x double> [[TMP1]]3, <2 x double> [[TMP1]]4, double* [[TMP1]]5) +// CHECK: ret void void test_vst1q_f64_x4(float64_t *a, float64x2x4_t b) { - // CHECK-LABEL: test_vst1q_f64_x4 vst1q_f64_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p8_x4(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]6, align 16 +// CHECK: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a) +// CHECK: ret void void test_vst1q_p8_x4(poly8_t *a, poly8x16x4_t b) { - // CHECK-LABEL: test_vst1q_p8_x4 vst1q_p8_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p16_x4(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8* +// 
CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i16* [[TMP1]]5) +// CHECK: ret void void test_vst1q_p16_x4(poly16_t *a, poly16x8x4_t b) { - // CHECK-LABEL: test_vst1q_p16_x4 vst1q_p16_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p64_x4(i64* %a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: 
[[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <2 x i64> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP1]]1, <2 x i64> [[TMP1]]2, <2 x i64> [[TMP1]]3, <2 x i64> [[TMP1]]4, i64* [[TMP1]]5) +// CHECK: ret void void test_vst1q_p64_x4(poly64_t *a, poly64x2x4_t b) { - // CHECK-LABEL: test_vst1q_p64_x4 vst1q_p64_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u8_x4(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8 +// CHECK: call void 
@llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a) +// CHECK: ret void void test_vst1_u8_x4(uint8_t *a, uint8x8x4_t b) { - // CHECK-LABEL: test_vst1_u8_x4 vst1_u8_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u16_x4(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, i16* [[TMP1]]5) +// CHECK: ret void void test_vst1_u16_x4(uint16_t *a, uint16x4x4_t b) { - // CHECK-LABEL: test_vst1_u16_x4 vst1_u16_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u32_x4(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: 
store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <2 x i32> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP1]]1, <2 x i32> [[TMP1]]2, <2 x i32> [[TMP1]]3, <2 x i32> [[TMP1]]4, i32* [[TMP1]]5) +// CHECK: ret void void test_vst1_u32_x4(uint32_t *a, uint32x2x4_t b) { - // CHECK-LABEL: test_vst1_u32_x4 vst1_u32_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_u64_x4(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// 
CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <1 x i64> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP1]]1, <1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, i64* [[TMP1]]5) +// CHECK: ret void void test_vst1_u64_x4(uint64_t *a, uint64x1x4_t b) { - // CHECK-LABEL: test_vst1_u64_x4 vst1_u64_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s8_x4(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr 
inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8 +// CHECK: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a) +// CHECK: ret void void test_vst1_s8_x4(int8_t *a, int8x8x4_t b) { - // CHECK-LABEL: test_vst1_s8_x4 vst1_s8_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s16_x4(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, i16* [[TMP1]]5) +// CHECK: ret void void test_vst1_s16_x4(int16_t *a, int16x4x4_t b) { - // CHECK-LABEL: test_vst1_s16_x4 vst1_s16_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s32_x4(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, 
align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <2 x i32> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32* +// CHECK: call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP1]]1, <2 x i32> [[TMP1]]2, <2 x i32> [[TMP1]]3, <2 x i32> [[TMP1]]4, i32* [[TMP1]]5) +// CHECK: ret void void test_vst1_s32_x4(int32_t *a, int32x2x4_t b) { - // CHECK-LABEL: test_vst1_s32_x4 vst1_s32_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_s64_x4(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x 
<1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <1 x i64> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP1]]1, <1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, i64* [[TMP1]]5) +// CHECK: ret void void test_vst1_s64_x4(int64_t *a, int64x1x4_t b) { - // CHECK-LABEL: test_vst1_s64_x4 vst1_s64_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f16_x4(half* %a, [4 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 
0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, i16* [[TMP1]]5) +// CHECK: ret void void test_vst1_f16_x4(float16_t *a, float16x4x4_t b) { - // CHECK-LABEL: test_vst1_f16_x4 vst1_f16_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f32_x4(float* %a, [4 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <2 x 
float> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <2 x float> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float* +// CHECK: call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> [[TMP1]]1, <2 x float> [[TMP1]]2, <2 x float> [[TMP1]]3, <2 x float> [[TMP1]]4, float* [[TMP1]]5) +// CHECK: ret void void test_vst1_f32_x4(float32_t *a, float32x2x4_t b) { - // CHECK-LABEL: test_vst1_f32_x4 vst1_f32_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_f64_x4(double* %a, [4 x <1 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <1 x double> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to double* +// CHECK: call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> [[TMP1]]1, <1 x double> [[TMP1]]2, <1 x double> [[TMP1]]3, <1 x double> [[TMP1]]4, double* [[TMP1]]5) +// CHECK: ret void void 
test_vst1_f64_x4(float64_t *a, float64x1x4_t b) { - // CHECK-LABEL: test_vst1_f64_x4 vst1_f64_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p8_x4(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8 +// CHECK: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a) +// CHECK: ret void void test_vst1_p8_x4(poly8_t *a, poly8x8x4_t b) { - // CHECK-LABEL: test_vst1_p8_x4 vst1_p8_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p16_x4(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* 
[[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, i16* [[TMP1]]5) +// CHECK: ret void void test_vst1_p16_x4(poly16_t *a, poly16x4x4_t b) { - // CHECK-LABEL: test_vst1_p16_x4 vst1_p16_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p64_x4(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = 
getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <1 x i64> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64* +// CHECK: call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP1]]1, <1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, i64* [[TMP1]]5) +// CHECK: ret void void test_vst1_p64_x4(poly64_t *a, poly64x1x4_t b) { - // CHECK-LABEL: test_vst1_p64_x4 vst1_p64_x4(a, b); - // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define i64 @test_vceqd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] int64_t test_vceqd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vceqd_s64 -// CHECK: {{cmeq d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (int64_t)vceqd_s64(a, b); } +// CHECK-LABEL: define i64 @test_vceqd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] uint64_t test_vceqd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vceqd_u64 -// CHECK: {{cmeq d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (int64_t)vceqd_u64(a, b); } +// CHECK-LABEL: define i64 @test_vceqzd_s64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, 0 +// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQZ_I]] int64_t test_vceqzd_s64(int64_t a) { -// CHECK-LABEL: test_vceqzd_s64 -// CHECK: {{cmeq d[0-9]+, d[0-9]+, #0x0|cmp x0, #0}} return (int64_t)vceqzd_s64(a); } +// CHECK-LABEL: define i64 @test_vceqzd_u64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, 0 +// CHECK: [[VCEQZD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQZD_I]] int64_t test_vceqzd_u64(int64_t a) { -// CHECK-LABEL: test_vceqzd_u64 -// CHECK: {{cmeq d[0-9]+, d[0-9]+, #0x0|cmp x0, #0}} return (int64_t)vceqzd_u64(a); } +// CHECK-LABEL: define i64 @test_vcged_s64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp sge i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] int64_t test_vcged_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vcged_s64 -// CHECK: {{cmge d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (int64_t)vcged_s64(a, b); } +// CHECK-LABEL: define i64 @test_vcged_u64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp uge i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] uint64_t test_vcged_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vcged_u64 -// CHECK: {{cmhs d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (uint64_t)vcged_u64(a, b); } +// CHECK-LABEL: define i64 @test_vcgezd_s64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sge i64 %a, 0 +// CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCGEZ_I]] int64_t test_vcgezd_s64(int64_t a) { -// CHECK-LABEL: test_vcgezd_s64 -// CHECK: {{cmge d[0-9]+, d[0-9]+, #0x0|eor x0, x[0-9]+, x0, asr #63}} return 
(int64_t)vcgezd_s64(a); } +// CHECK-LABEL: define i64 @test_vcgtd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp sgt i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] int64_t test_vcgtd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vcgtd_s64 -// CHECK: {{cmgt d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (int64_t)vcgtd_s64(a, b); } +// CHECK-LABEL: define i64 @test_vcgtd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp ugt i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] uint64_t test_vcgtd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vcgtd_u64 -// CHECK: {{cmhi d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (uint64_t)vcgtd_u64(a, b); } +// CHECK-LABEL: define i64 @test_vcgtzd_s64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sgt i64 %a, 0 +// CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCGTZ_I]] int64_t test_vcgtzd_s64(int64_t a) { -// CHECK-LABEL: test_vcgtzd_s64 -// CHECK: {{cmgt d[0-9]+, d[0-9]+, #0x0|cmp x0, #0}} return (int64_t)vcgtzd_s64(a); } +// CHECK-LABEL: define i64 @test_vcled_s64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp sle i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] int64_t test_vcled_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vcled_s64 -// CHECK: {{cmge d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (int64_t)vcled_s64(a, b); } +// CHECK-LABEL: define i64 @test_vcled_u64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp ule i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] uint64_t test_vcled_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vcled_u64 -// CHECK: {{cmhs d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (uint64_t)vcled_u64(a, b); } +// CHECK-LABEL: define i64 @test_vclezd_s64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sle i64 %a, 0 +// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCLEZ_I]] int64_t test_vclezd_s64(int64_t a) { -// CHECK-LABEL: test_vclezd_s64 -// CHECK: {{cmle d[0-9]+, d[0-9]+, #0x0|cmp x0, #1}} return (int64_t)vclezd_s64(a); } +// CHECK-LABEL: define i64 @test_vcltd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp slt i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] int64_t test_vcltd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vcltd_s64 -// CHECK: {{cmgt d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (int64_t)vcltd_s64(a, b); } +// CHECK-LABEL: define i64 @test_vcltd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = icmp ult i64 %a, %b +// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQD_I]] uint64_t test_vcltd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vcltd_u64 -// CHECK: {{cmhi d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}} return (uint64_t)vcltd_u64(a, b); } +// CHECK-LABEL: define i64 @test_vcltzd_s64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp slt i64 %a, 0 +// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCLTZ_I]] int64_t test_vcltzd_s64(int64_t a) { -// CHECK-LABEL: test_vcltzd_s64 -// CHECK: {{cmlt d[0-9]+, d[0-9]+, #0x0|asr x0, x0, #63}} return (int64_t)vcltzd_s64(a); } +// CHECK-LABEL: define i64 @test_vtstd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = and i64 %a, %b +// CHECK: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0 +// CHECK: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64 +// CHECK: ret i64 [[VTSTD_I]] int64_t test_vtstd_s64(int64_t a, int64_t 
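// Editor's sketch: the scalar compares checked above return an all-ones or
// all-zeros 64-bit mask (the i1 icmp result sign-extended), not a C bool,
// which makes them directly usable for branch-free selection. Helper name
// and usage are illustrative only.
static inline int64_t sketch_branchless_max(int64_t a, int64_t b) {
  uint64_t mask = vcgtd_s64(a, b); // ~0 if a > b, else 0
  return (int64_t)(((uint64_t)a & mask) | ((uint64_t)b & ~mask));
}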
b) { -// CHECK-LABEL: test_vtstd_s64 -// CHECK: {{cmtst d[0-9]+, d[0-9]+, d[0-9]+|tst x1, x0}} return (int64_t)vtstd_s64(a, b); } +// CHECK-LABEL: define i64 @test_vtstd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[TMP0:%.*]] = and i64 %a, %b +// CHECK: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0 +// CHECK: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64 +// CHECK: ret i64 [[VTSTD_I]] uint64_t test_vtstd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vtstd_u64 -// CHECK: {{cmtst d[0-9]+, d[0-9]+, d[0-9]+|tst x1, x0}} return (uint64_t)vtstd_u64(a, b); } +// CHECK-LABEL: define i64 @test_vabsd_s64(i64 %a) #0 { +// CHECK: [[VABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.abs.i64(i64 %a) #4 +// CHECK: ret i64 [[VABSD_S64_I]] int64_t test_vabsd_s64(int64_t a) { -// CHECK-LABEL: test_vabsd_s64 -// CHECK: abs {{d[0-9]+}}, {{d[0-9]+}} return (int64_t)vabsd_s64(a); } +// CHECK-LABEL: define i8 @test_vqabsb_s8(i8 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[VQABSB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQABSB_S8_I]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqabsb_s8(int8_t a) { -// CHECK-LABEL: test_vqabsb_s8 -// CHECK: sqabs {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} return (int8_t)vqabsb_s8(a); } +// CHECK-LABEL: define i16 @test_vqabsh_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQABSH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQABSH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqabsh_s16(int16_t a) { -// CHECK-LABEL: test_vqabsh_s16 -// CHECK: sqabs {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} return (int16_t)vqabsh_s16(a); } +// CHECK-LABEL: define i32 @test_vqabss_s32(i32 %a) #0 { +// CHECK: [[VQABSS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a) #4 +// CHECK: ret i32 [[VQABSS_S32_I]] int32_t test_vqabss_s32(int32_t a) { -// CHECK-LABEL: test_vqabss_s32 -// CHECK: sqabs {{s[0-9]+}}, {{s[0-9]+}} return (int32_t)vqabss_s32(a); } +// CHECK-LABEL: define i64 @test_vqabsd_s64(i64 %a) #0 { +// CHECK: [[VQABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqabs.i64(i64 %a) #4 +// CHECK: ret i64 [[VQABSD_S64_I]] int64_t test_vqabsd_s64(int64_t a) { -// CHECK-LABEL: test_vqabsd_s64 -// CHECK: sqabs {{d[0-9]+}}, {{d[0-9]+}} return (int64_t)vqabsd_s64(a); } +// CHECK-LABEL: define i64 @test_vnegd_s64(i64 %a) #0 { +// CHECK: [[VNEGD_I:%.*]] = sub i64 0, %a +// CHECK: ret i64 [[VNEGD_I]] int64_t test_vnegd_s64(int64_t a) { -// CHECK-LABEL: test_vnegd_s64 -// CHECK: neg {{[xd][0-9]+}}, {{[xd][0-9]+}} return (int64_t)vnegd_s64(a); } +// CHECK-LABEL: define i8 @test_vqnegb_s8(i8 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[VQNEGB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQNEGB_S8_I]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqnegb_s8(int8_t a) { -// CHECK-LABEL: test_vqnegb_s8 -// CHECK: sqneg {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} return (int8_t)vqnegb_s8(a); } +// CHECK-LABEL: define i16 @test_vqnegh_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQNEGH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQNEGH_S16_I]], i64 
0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqnegh_s16(int16_t a) { -// CHECK-LABEL: test_vqnegh_s16 -// CHECK: sqneg {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} return (int16_t)vqnegh_s16(a); } +// CHECK-LABEL: define i32 @test_vqnegs_s32(i32 %a) #0 { +// CHECK: [[VQNEGS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqneg.i32(i32 %a) #4 +// CHECK: ret i32 [[VQNEGS_S32_I]] int32_t test_vqnegs_s32(int32_t a) { -// CHECK-LABEL: test_vqnegs_s32 -// CHECK: sqneg {{s[0-9]+}}, {{s[0-9]+}} return (int32_t)vqnegs_s32(a); } +// CHECK-LABEL: define i64 @test_vqnegd_s64(i64 %a) #0 { +// CHECK: [[VQNEGD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqneg.i64(i64 %a) #4 +// CHECK: ret i64 [[VQNEGD_S64_I]] int64_t test_vqnegd_s64(int64_t a) { -// CHECK-LABEL: test_vqnegd_s64 -// CHECK: sqneg {{d[0-9]+}}, {{d[0-9]+}} return (int64_t)vqnegd_s64(a); } +// CHECK-LABEL: define i8 @test_vuqaddb_s8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VUQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VUQADDB_S8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] int8_t test_vuqaddb_s8(int8_t a, int8_t b) { -// CHECK-LABEL: test_vuqaddb_s8 -// CHECK: suqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} return (int8_t)vuqaddb_s8(a, b); } +// CHECK-LABEL: define i16 @test_vuqaddh_s16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VUQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VUQADDH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] int16_t test_vuqaddh_s16(int16_t a, int16_t b) { -// CHECK-LABEL: test_vuqaddh_s16 -// CHECK: suqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} return (int16_t)vuqaddh_s16(a, b); } +// CHECK-LABEL: define i32 @test_vuqadds_s32(i32 %a, i32 %b) #0 { +// CHECK: [[VUQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.suqadd.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VUQADDS_S32_I]] int32_t test_vuqadds_s32(int32_t a, int32_t b) { -// CHECK-LABEL: test_vuqadds_s32 -// CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}} return (int32_t)vuqadds_s32(a, b); } +// CHECK-LABEL: define i64 @test_vuqaddd_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VUQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.suqadd.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VUQADDD_S64_I]] int64_t test_vuqaddd_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vuqaddd_s64 -// CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}} return (int64_t)vuqaddd_s64(a, b); } +// CHECK-LABEL: define i8 @test_vsqaddb_u8(i8 %a, i8 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0 +// CHECK: [[VSQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VSQADDB_U8_I]], i64 0 +// CHECK: ret i8 [[TMP2]] uint8_t test_vsqaddb_u8(uint8_t a, uint8_t b) { -// CHECK-LABEL: test_vsqaddb_u8 -// CHECK: usqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}} return (uint8_t)vsqaddb_u8(a, b); } +// CHECK-LABEL: define i16 @test_vsqaddh_u16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = 
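// Editor's sketch: unlike a plain abs/neg, the sqabs/sqneg forms tested
// here saturate rather than wrap, so the single problematic input (the
// minimum value) maps to the maximum. Illustrative values only.
static inline void sketch_qabs_edge(void) {
  int8_t r = vqabsb_s8((int8_t)-128); // saturates to 127 instead of wrapping
  (void)r;
}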
insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VSQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VSQADDH_U16_I]], i64 0 +// CHECK: ret i16 [[TMP2]] uint16_t test_vsqaddh_u16(uint16_t a, uint16_t b) { -// CHECK-LABEL: test_vsqaddh_u16 -// CHECK: usqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} return (uint16_t)vsqaddh_u16(a, b); } +// CHECK-LABEL: define i32 @test_vsqadds_u32(i32 %a, i32 %b) #0 { +// CHECK: [[VSQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.usqadd.i32(i32 %a, i32 %b) #4 +// CHECK: ret i32 [[VSQADDS_U32_I]] uint32_t test_vsqadds_u32(uint32_t a, uint32_t b) { -// CHECK-LABEL: test_vsqadds_u32 -// CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}} return (uint32_t)vsqadds_u32(a, b); } +// CHECK-LABEL: define i64 @test_vsqaddd_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VSQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.usqadd.i64(i64 %a, i64 %b) #4 +// CHECK: ret i64 [[VSQADDD_U64_I]] uint64_t test_vsqaddd_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vsqaddd_u64 -// CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}} return (uint64_t)vsqaddd_u64(a, b); } +// CHECK-LABEL: define i32 @test_vqdmlalh_s16(i32 %a, i16 %b, i16 %c) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %c, i64 0 +// CHECK: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0 +// CHECK: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0_I]]) #4 +// CHECK: ret i32 [[VQDMLXL1_I]] int32_t test_vqdmlalh_s16(int32_t a, int16_t b, int16_t c) { -// CHECK-ARM64-LABEL: test_vqdmlalh_s16 -// CHECK-ARM64: sqdmull v[[PROD:[0-9]+]].4s, {{v[0-9]+.4h}}, {{v[0-9]+.4h}} -// CHECK-ARM64: sqadd {{s[0-9]+}}, {{s[0-9]+}}, s[[PROD]] return (int32_t)vqdmlalh_s16(a, b, c); } +// CHECK-LABEL: define i64 @test_vqdmlals_s32(i64 %a, i32 %b, i32 %c) #0 { +// CHECK: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) #4 +// CHECK: [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL_I]]) #4 +// CHECK: ret i64 [[VQDMLXL1_I]] int64_t test_vqdmlals_s32(int64_t a, int32_t b, int32_t c) { -// CHECK-LABEL: test_vqdmlals_s32 -// CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} return (int64_t)vqdmlals_s32(a, b, c); } +// CHECK-LABEL: define i32 @test_vqdmlslh_s16(i32 %a, i16 %b, i16 %c) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %c, i64 0 +// CHECK: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0 +// CHECK: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0_I]]) #4 +// CHECK: ret i32 [[VQDMLXL1_I]] int32_t test_vqdmlslh_s16(int32_t a, int16_t b, int16_t c) { -// CHECK-ARM64-LABEL: test_vqdmlslh_s16 -// CHECK-ARM64: sqdmull v[[PROD:[0-9]+]].4s, {{v[0-9]+.4h}}, {{v[0-9]+.4h}} -// CHECK-ARM64: sqsub {{s[0-9]+}}, {{s[0-9]+}}, s[[PROD]] return (int32_t)vqdmlslh_s16(a, b, c); } +// CHECK-LABEL: define i64 @test_vqdmlsls_s32(i64 %a, i32 %b, i32 %c) #0 { +// CHECK: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) #4 +// CHECK: [[VQDMLXL1_I:%.*]] = call i64 
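// Editor's sketch: suqadd/usqadd mix signedness. In vsqaddb_u8 the addend
// is treated as signed while the accumulator and result stay unsigned, so
// the sum clamps at 0 and UINT8_MAX. Illustrative values only.
static inline uint8_t sketch_usqadd_edge(void) {
  return vsqaddb_u8((uint8_t)10, (uint8_t)0xec); // 0xec acts as -20; clamps to 0
}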
@llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL_I]]) #4 +// CHECK: ret i64 [[VQDMLXL1_I]] int64_t test_vqdmlsls_s32(int64_t a, int32_t b, int32_t c) { -// CHECK-LABEL: test_vqdmlsls_s32 -// CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} return (int64_t)vqdmlsls_s32(a, b, c); } +// CHECK-LABEL: define i32 @test_vqdmullh_s16(i16 %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4 +// CHECK: [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0 +// CHECK: ret i32 [[TMP2]] int32_t test_vqdmullh_s16(int16_t a, int16_t b) { -// CHECK-LABEL: test_vqdmullh_s16 -// CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} return (int32_t)vqdmullh_s16(a, b); } +// CHECK-LABEL: define i64 @test_vqdmulls_s32(i32 %a, i32 %b) #0 { +// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 %b) #4 +// CHECK: ret i64 [[VQDMULLS_S32_I]] int64_t test_vqdmulls_s32(int32_t a, int32_t b) { -// CHECK-LABEL: test_vqdmulls_s32 -// CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} return (int64_t)vqdmulls_s32(a, b); } +// CHECK-LABEL: define i8 @test_vqmovunh_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQMOVUNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVUNH_S16_I]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqmovunh_s16(int16_t a) { -// CHECK-LABEL: test_vqmovunh_s16 -// CHECK: sqxtun {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}} return (int8_t)vqmovunh_s16(a); } +// CHECK-LABEL: define i16 @test_vqmovuns_s32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQMOVUNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVUNS_S32_I]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqmovuns_s32(int32_t a) { -// CHECK-LABEL: test_vqmovuns_s32 -// CHECK: sqxtun {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}} return (int16_t)vqmovuns_s32(a); } +// CHECK-LABEL: define i32 @test_vqmovund_s64(i64 %a) #0 { +// CHECK: [[VQMOVUND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %a) #4 +// CHECK: ret i32 [[VQMOVUND_S64_I]] int32_t test_vqmovund_s64(int64_t a) { -// CHECK-LABEL: test_vqmovund_s64 -// CHECK: sqxtun {{s[0-9]+}}, {{d[0-9]+}} return (int32_t)vqmovund_s64(a); } +// CHECK-LABEL: define i8 @test_vqmovnh_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQMOVNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_S16_I]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqmovnh_s16(int16_t a) { -// CHECK-LABEL: test_vqmovnh_s16 -// CHECK: sqxtn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}} return (int8_t)vqmovnh_s16(a); } +// CHECK-LABEL: define i16 @test_vqmovns_s32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQMOVNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_S32_I]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t 
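// Editor's sketch: the vqmov(u)n family narrows with saturation, and the
// "un" variants additionally clamp negative signed inputs to zero on the
// way to an unsigned result. Illustrative values only.
static inline void sketch_qmovn_edge(void) {
  int8_t n = vqmovnh_s16((int16_t)300);  // 300 > 127, saturates to 127
  uint8_t u = vqmovunh_s16((int16_t)-5); // negative clamps to 0
  (void)n; (void)u;
}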
test_vqmovns_s32(int32_t a) { -// CHECK-LABEL: test_vqmovns_s32 -// CHECK: sqxtn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}} return (int16_t)vqmovns_s32(a); } +// CHECK-LABEL: define i32 @test_vqmovnd_s64(i64 %a) #0 { +// CHECK: [[VQMOVND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %a) #4 +// CHECK: ret i32 [[VQMOVND_S64_I]] int32_t test_vqmovnd_s64(int64_t a) { -// CHECK-LABEL: test_vqmovnd_s64 -// CHECK: sqxtn {{s[0-9]+}}, {{d[0-9]+}} return (int32_t)vqmovnd_s64(a); } +// CHECK-LABEL: define i8 @test_vqmovnh_u16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQMOVNH_U16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_U16_I]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqmovnh_u16(int16_t a) { -// CHECK-LABEL: test_vqmovnh_u16 -// CHECK: uqxtn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}} return (int8_t)vqmovnh_u16(a); } +// CHECK-LABEL: define i16 @test_vqmovns_u32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQMOVNS_U32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[TMP0]]) #4 +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_U32_I]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqmovns_u32(int32_t a) { -// CHECK-LABEL: test_vqmovns_u32 -// CHECK: uqxtn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}} return (int16_t)vqmovns_u32(a); } +// CHECK-LABEL: define i32 @test_vqmovnd_u64(i64 %a) #0 { +// CHECK: [[VQMOVND_U64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %a) #4 +// CHECK: ret i32 [[VQMOVND_U64_I]] int32_t test_vqmovnd_u64(int64_t a) { -// CHECK-LABEL: test_vqmovnd_u64 -// CHECK: uqxtn {{s[0-9]+}}, {{d[0-9]+}} return (int32_t)vqmovnd_u64(a); } +// CHECK-LABEL: define i32 @test_vceqs_f32(float %a, float %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oeq float %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCMPD_I]] uint32_t test_vceqs_f32(float32_t a, float32_t b) { -// CHECK-LABEL: test_vceqs_f32 -// CHECK: {{fcmeq s0, s0, s1|fcmp s0, s1}} return (uint32_t)vceqs_f32(a, b); } +// CHECK-LABEL: define i64 @test_vceqd_f64(double %a, double %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oeq double %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCMPD_I]] uint64_t test_vceqd_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vceqd_f64 -// CHECK: {{fcmeq d0, d0, d1|fcmp d0, d1}} return (uint64_t)vceqd_f64(a, b); } +// CHECK-LABEL: define i32 @test_vceqzs_f32(float %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oeq float %a, 0.000000e+00 +// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCEQZ_I]] uint32_t test_vceqzs_f32(float32_t a) { -// CHECK-LABEL: test_vceqzs_f32 -// CHECK: {{fcmeq s0, s0, #0.0|fcmp s0, #0.0}} return (uint32_t)vceqzs_f32(a); } +// CHECK-LABEL: define i64 @test_vceqzd_f64(double %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oeq double %a, 0.000000e+00 +// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCEQZ_I]] uint64_t test_vceqzd_f64(float64_t a) { -// CHECK-LABEL: test_vceqzd_f64 -// CHECK: {{fcmeq d0, d0, #0.0|fcmp d0, #0.0}} return (uint64_t)vceqzd_f64(a); } +// CHECK-LABEL: define i32 @test_vcges_f32(float %a, float %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oge float %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCMPD_I]] uint32_t test_vcges_f32(float32_t a, float32_t b) { -// 
CHECK-LABEL: test_vcges_f32 -// CHECK: {{fcmge s0, s0, s1|fcmp s0, s1}} return (uint32_t)vcges_f32(a, b); } +// CHECK-LABEL: define i64 @test_vcged_f64(double %a, double %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oge double %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCMPD_I]] uint64_t test_vcged_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vcged_f64 -// CHECK: {{fcmge d0, d0, d1|fcmp d0, d1}} return (uint64_t)vcged_f64(a, b); } +// CHECK-LABEL: define i32 @test_vcgezs_f32(float %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oge float %a, 0.000000e+00 +// CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCGEZ_I]] uint32_t test_vcgezs_f32(float32_t a) { -// CHECK-LABEL: test_vcgezs_f32 -// CHECK: {{fcmge s0, s0, #0.0|fcmp s0, #0.0}} return (uint32_t)vcgezs_f32(a); } +// CHECK-LABEL: define i64 @test_vcgezd_f64(double %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp oge double %a, 0.000000e+00 +// CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCGEZ_I]] uint64_t test_vcgezd_f64(float64_t a) { -// CHECK-LABEL: test_vcgezd_f64 -// CHECK: {{fcmge d0, d0, #0.0|fcmp d0, #0.0}} return (uint64_t)vcgezd_f64(a); } +// CHECK-LABEL: define i32 @test_vcgts_f32(float %a, float %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ogt float %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCMPD_I]] uint32_t test_vcgts_f32(float32_t a, float32_t b) { -// CHECK-LABEL: test_vcgts_f32 -// CHECK: {{fcmgt s0, s0, s1|fcmp s0, s1}} return (uint32_t)vcgts_f32(a, b); } +// CHECK-LABEL: define i64 @test_vcgtd_f64(double %a, double %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ogt double %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCMPD_I]] uint64_t test_vcgtd_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vcgtd_f64 -// CHECK: {{fcmgt d0, d0, d1|fcmp d0, d1}} return (uint64_t)vcgtd_f64(a, b); } +// CHECK-LABEL: define i32 @test_vcgtzs_f32(float %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ogt float %a, 0.000000e+00 +// CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCGTZ_I]] uint32_t test_vcgtzs_f32(float32_t a) { -// CHECK-LABEL: test_vcgtzs_f32 -// CHECK: {{fcmgt s0, s0, #0.0|fcmp s0, #0.0}} return (uint32_t)vcgtzs_f32(a); } +// CHECK-LABEL: define i64 @test_vcgtzd_f64(double %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ogt double %a, 0.000000e+00 +// CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCGTZ_I]] uint64_t test_vcgtzd_f64(float64_t a) { -// CHECK-LABEL: test_vcgtzd_f64 -// CHECK: {{fcmgt d0, d0, #0.0|fcmp d0, #0.0}} return (uint64_t)vcgtzd_f64(a); } +// CHECK-LABEL: define i32 @test_vcles_f32(float %a, float %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ole float %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCMPD_I]] uint32_t test_vcles_f32(float32_t a, float32_t b) { -// CHECK-LABEL: test_vcles_f32 -// CHECK: {{fcmge s0, s1, s0|fcmp s0, s1}} return (uint32_t)vcles_f32(a, b); } +// CHECK-LABEL: define i64 @test_vcled_f64(double %a, double %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ole double %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCMPD_I]] uint64_t test_vcled_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vcled_f64 -// CHECK: {{fcmge d0, d1, d0|fcmp d0, d1}} return (uint64_t)vcled_f64(a, b); } +// CHECK-LABEL: define i32 @test_vclezs_f32(float %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ole float %a, 0.000000e+00 +// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// 
CHECK: ret i32 [[VCLEZ_I]] uint32_t test_vclezs_f32(float32_t a) { -// CHECK-LABEL: test_vclezs_f32 -// CHECK: {{fcmle s0, s0, #0.0|fcmp s0, #0.0}} return (uint32_t)vclezs_f32(a); } +// CHECK-LABEL: define i64 @test_vclezd_f64(double %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp ole double %a, 0.000000e+00 +// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCLEZ_I]] uint64_t test_vclezd_f64(float64_t a) { -// CHECK-LABEL: test_vclezd_f64 -// CHECK: {{fcmle d0, d0, #0.0|fcmp d0, #0.0}} return (uint64_t)vclezd_f64(a); } +// CHECK-LABEL: define i32 @test_vclts_f32(float %a, float %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp olt float %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCMPD_I]] uint32_t test_vclts_f32(float32_t a, float32_t b) { -// CHECK-LABEL: test_vclts_f32 -// CHECK: {{fcmgt s0, s1, s0|fcmp s0, s1}} return (uint32_t)vclts_f32(a, b); } +// CHECK-LABEL: define i64 @test_vcltd_f64(double %a, double %b) #0 { +// CHECK: [[TMP0:%.*]] = fcmp olt double %a, %b +// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCMPD_I]] uint64_t test_vcltd_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vcltd_f64 -// CHECK: {{fcmgt d0, d1, d0|fcmp d0, d1}} return (uint64_t)vcltd_f64(a, b); } +// CHECK-LABEL: define i32 @test_vcltzs_f32(float %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp olt float %a, 0.000000e+00 +// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK: ret i32 [[VCLTZ_I]] uint32_t test_vcltzs_f32(float32_t a) { -// CHECK-LABEL: test_vcltzs_f32 -// CHECK: {{fcmlt s0, s0, #0.0|fcmp s0, #0.0}} return (uint32_t)vcltzs_f32(a); } +// CHECK-LABEL: define i64 @test_vcltzd_f64(double %a) #0 { +// CHECK: [[TMP0:%.*]] = fcmp olt double %a, 0.000000e+00 +// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK: ret i64 [[VCLTZ_I]] uint64_t test_vcltzd_f64(float64_t a) { -// CHECK-LABEL: test_vcltzd_f64 -// CHECK: {{fcmlt d0, d0, #0.0|fcmp d0, #0.0}} return (uint64_t)vcltzd_f64(a); } +// CHECK-LABEL: define i32 @test_vcages_f32(float %a, float %b) #0 { +// CHECK: [[VCAGES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %a, float %b) #4 +// CHECK: ret i32 [[VCAGES_F32_I]] uint32_t test_vcages_f32(float32_t a, float32_t b) { -// CHECK-LABEL: test_vcages_f32 -// CHECK: facge s0, s0, s1 return (uint32_t)vcages_f32(a, b); } +// CHECK-LABEL: define i64 @test_vcaged_f64(double %a, double %b) #0 { +// CHECK: [[VCAGED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %a, double %b) #4 +// CHECK: ret i64 [[VCAGED_F64_I]] uint64_t test_vcaged_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vcaged_f64 -// CHECK: facge d0, d0, d1 return (uint64_t)vcaged_f64(a, b); } +// CHECK-LABEL: define i32 @test_vcagts_f32(float %a, float %b) #0 { +// CHECK: [[VCAGTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %a, float %b) #4 +// CHECK: ret i32 [[VCAGTS_F32_I]] uint32_t test_vcagts_f32(float32_t a, float32_t b) { -// CHECK-LABEL: test_vcagts_f32 -// CHECK: facgt s0, s0, s1 return (uint32_t)vcagts_f32(a, b); } +// CHECK-LABEL: define i64 @test_vcagtd_f64(double %a, double %b) #0 { +// CHECK: [[VCAGTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %a, double %b) #4 +// CHECK: ret i64 [[VCAGTD_F64_I]] uint64_t test_vcagtd_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vcagtd_f64 -// CHECK: facgt d0, d0, d1 return (uint64_t)vcagtd_f64(a, b); } +// CHECK-LABEL: define i32 @test_vcales_f32(float %a, float %b) #0 { +// CHECK: [[VCALES_F32_I:%.*]] = call i32 
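// Editor's sketch: the vcage/vcagt family compares magnitudes, i.e.
// vcages_f32(a, b) tests |a| >= |b|, and the "le"/"lt" forms are the same
// facge/facgt operation with the operands swapped, as the argument order in
// the nearby checks shows. Illustrative only.
static inline uint32_t sketch_mag_ge(float32_t a, float32_t b) {
  return vcages_f32(a, b); // all-ones iff |a| >= |b|, else 0
}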
@llvm.aarch64.neon.facge.i32.f32(float %b, float %a) #4
+// CHECK: ret i32 [[VCALES_F32_I]]
uint32_t test_vcales_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcales_f32
-// CHECK: facge s0, s1, s0
  return (uint32_t)vcales_f32(a, b);
}
+// CHECK-LABEL: define i64 @test_vcaled_f64(double %a, double %b) #0 {
+// CHECK: [[VCALED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %b, double %a) #4
+// CHECK: ret i64 [[VCALED_F64_I]]
uint64_t test_vcaled_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcaled_f64
-// CHECK: facge d0, d1, d0
  return (uint64_t)vcaled_f64(a, b);
}
+// CHECK-LABEL: define i32 @test_vcalts_f32(float %a, float %b) #0 {
+// CHECK: [[VCALTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %b, float %a) #4
+// CHECK: ret i32 [[VCALTS_F32_I]]
uint32_t test_vcalts_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcalts_f32
-// CHECK: facgt s0, s1, s0
  return (uint32_t)vcalts_f32(a, b);
}
+// CHECK-LABEL: define i64 @test_vcaltd_f64(double %a, double %b) #0 {
+// CHECK: [[VCALTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %b, double %a) #4
+// CHECK: ret i64 [[VCALTD_F64_I]]
uint64_t test_vcaltd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcaltd_f64
-// CHECK: facgt d0, d1, d0
  return (uint64_t)vcaltd_f64(a, b);
}
+// CHECK-LABEL: define i64 @test_vshrd_n_s64(i64 %a) #0 {
+// CHECK: [[SHRD_N:%.*]] = ashr i64 %a, 1
+// CHECK: ret i64 [[SHRD_N]]
int64_t test_vshrd_n_s64(int64_t a) {
-// CHECK-LABEL: test_vshrd_n_s64
-// CHECK: {{sshr d[0-9]+, d[0-9]+, #1|asr x0, x0, #1}}
  return (int64_t)vshrd_n_s64(a, 1);
}
+// CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1>
+// CHECK: ret <1 x i64> [[VSHR_N]]
int64x1_t test_vshr_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vshr_n_s64
-// CHECK: sshr {{d[0-9]+}}, {{d[0-9]+}}, #1
  return vshr_n_s64(a, 1);
}
+// CHECK-LABEL: define i64 @test_vshrd_n_u64(i64 %a) #0 {
+// CHECK: ret i64 0
uint64_t test_vshrd_n_u64(uint64_t a) {
-// CHECK-ARM64-LABEL: test_vshrd_n_u64
-// CHECK-ARM64: mov x0, xzr
  return (uint64_t)vshrd_n_u64(a, 64);
}
+// CHECK-LABEL: define i64 @test_vshrd_n_u64_2() #0 {
+// CHECK: ret i64 0
uint64_t test_vshrd_n_u64_2() {
-// CHECK-ARM64-LABEL: test_vshrd_n_u64_2
-// CHECK-ARM64: mov x0, xzr
  uint64_t a = UINT64_C(0xf000000000000000);
  return vshrd_n_u64(a, 64);
}
+// CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1>
+// CHECK: ret <1 x i64> [[VSHR_N]]
uint64x1_t test_vshr_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vshr_n_u64
-// CHECK: ushr {{d[0-9]+}}, {{d[0-9]+}}, #1
  return vshr_n_u64(a, 1);
}
+// CHECK-LABEL: define i64 @test_vrshrd_n_s64(i64 %a) #0 {
+// CHECK: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 -63)
+// CHECK: ret i64 [[VRSHR_N]]
int64_t test_vrshrd_n_s64(int64_t a) {
-// CHECK-LABEL: test_vrshrd_n_s64
-// CHECK: srshr {{d[0-9]+}}, {{d[0-9]+}}, #63
  return (int64_t)vrshrd_n_s64(a, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK: ret <1 x i64> [[VRSHR_N1]]
int64x1_t test_vrshr_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vrshr_n_s64
-// CHECK: srshr d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vrshr_n_s64(a, 1);
}
+// CHECK-LABEL: define i64 @test_vrshrd_n_u64(i64 %a) #0 {
+// CHECK: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 -63)
+// CHECK: ret i64 [[VRSHR_N]]
uint64_t test_vrshrd_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vrshrd_n_u64
-// CHECK: urshr {{d[0-9]+}}, {{d[0-9]+}}, #63
  return (uint64_t)vrshrd_n_u64(a, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK: ret <1 x i64> [[VRSHR_N1]]
uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vrshr_n_u64
-// CHECK: urshr d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vrshr_n_u64(a, 1);
}
+// CHECK-LABEL: define i64 @test_vsrad_n_s64(i64 %a, i64 %b) #0 {
+// CHECK: [[SHRD_N:%.*]] = ashr i64 %b, 63
+// CHECK: [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
+// CHECK: ret i64 [[TMP0]]
int64_t test_vsrad_n_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vsrad_n_s64
-// CHECK: {{ssra d[0-9]+, d[0-9]+, #63|add x0, x0, x1, asr #63}}
  return (int64_t)vsrad_n_s64(a, b, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], <i64 1>
+// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK: ret <1 x i64> [[TMP4]]
int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vsra_n_s64
-// CHECK: ssra d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vsra_n_s64(a, b, 1);
}
+// CHECK-LABEL: define i64 @test_vsrad_n_u64(i64 %a, i64 %b) #0 {
+// CHECK: [[SHRD_N:%.*]] = lshr i64 %b, 63
+// CHECK: [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
+// CHECK: ret i64 [[TMP0]]
uint64_t test_vsrad_n_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vsrad_n_u64
-// CHECK: {{usra d[0-9]+, d[0-9]+, #63|add x0, x0, x1, lsr #63}}
  return (uint64_t)vsrad_n_u64(a, b, 63);
}
+// CHECK-LABEL: define i64 @test_vsrad_n_u64_2(i64 %a, i64 %b) #0 {
+// CHECK: ret i64 %a
uint64_t test_vsrad_n_u64_2(uint64_t a, uint64_t b) {
-// CHECK-ARM64-LABEL: test_vsrad_n_u64_2
-// CHECK-ARM64-NOT: add
  return (uint64_t)vsrad_n_u64(a, b, 64);
}
+// CHECK-LABEL: define <1 x i64> @test_vsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], <i64 1>
+// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK: ret <1 x i64> [[TMP4]]
uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vsra_n_u64
-// CHECK: usra d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vsra_n_u64(a, b, 1);
}
+// CHECK-LABEL: define i64 @test_vrsrad_n_s64(i64 %a, i64 %b) #0 {
+// CHECK: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %b, i64 -63)
+// CHECK: [[TMP1:%.*]] = add i64 %a, [[TMP0]]
+// CHECK: ret i64 [[TMP1]]
int64_t test_vrsrad_n_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vrsrad_n_s64
-// CHECK: {{srsra d[0-9]+, d[0-9]+, #63}}
  return (int64_t)vrsrad_n_s64(a, b, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]]
+// CHECK: ret <1 x i64> [[TMP3]]
int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vrsra_n_s64
-// CHECK: srsra d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vrsra_n_s64(a, b, 1);
}
+// CHECK-LABEL: define i64 @test_vrsrad_n_u64(i64 %a, i64 %b) #0 {
+// CHECK: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %b, i64 -63)
+// CHECK: [[TMP1:%.*]] = add i64 %a, [[TMP0]]
+// CHECK: ret i64 [[TMP1]]
uint64_t test_vrsrad_n_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vrsrad_n_u64
-// CHECK: ursra {{d[0-9]+}}, {{d[0-9]+}}, #63
  return (uint64_t)vrsrad_n_u64(a, b, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vrsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]]
+// CHECK: ret <1 x i64> [[TMP3]]
uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vrsra_n_u64
-// CHECK: ursra d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vrsra_n_u64(a, b, 1);
}
+// CHECK-LABEL: define i64 @test_vshld_n_s64(i64 %a) #0 {
+// CHECK: [[SHLD_N:%.*]] = shl i64 %a, 1
+// CHECK: ret i64 [[SHLD_N]]
int64_t test_vshld_n_s64(int64_t a) {
-// CHECK-LABEL: test_vshld_n_s64
-// CHECK: {{shl d[0-9]+, d[0-9]+, #1|lsl x0, x0, #1}}
  return (int64_t)vshld_n_s64(a, 1);
}
+// CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
+// CHECK: ret <1 x i64> [[VSHL_N]]
int64x1_t test_vshl_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vshl_n_s64
-// CHECK: shl d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vshl_n_s64(a, 1);
}
+// CHECK-LABEL: define i64 @test_vshld_n_u64(i64 %a) #0 {
+// CHECK: [[SHLD_N:%.*]] = shl i64 %a, 63
+// CHECK: ret i64 [[SHLD_N]]
uint64_t test_vshld_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vshld_n_u64
-// CHECK: {{shl d[0-9]+, d[0-9]+, #63|lsl x0, x0, #63}}
  return (uint64_t)vshld_n_u64(a, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
+// CHECK: ret <1 x i64> [[VSHL_N]]
uint64x1_t test_vshl_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vshl_n_u64
-// CHECK: shl d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vshl_n_u64(a, 1);
}
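// Editor's sketch: the (v)sra_n family fuses a shift with an accumulate,
// d = a + (b >> n); the rounding variants add 2^(n-1) before shifting,
// which is why vrsra_n lowers to srshl/urshl with a negative shift amount
// in the checks above. Illustrative only.
static inline int64_t sketch_sign_accumulate(int64_t acc, int64_t x) {
  return vsrad_n_s64(acc, x, 63); // acc + (x >> 63): adds -1 for negative x
}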
+// CHECK-LABEL: define i8 @test_vqshlb_n_s8(i8 %a) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK: [[VQSHLB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>)
+// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_S8]], i64 0
+// CHECK: ret i8 [[TMP1]]
int8_t test_vqshlb_n_s8(int8_t a) {
-// CHECK-LABEL: test_vqshlb_n_s8
-// CHECK: sqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, #7
  return (int8_t)vqshlb_n_s8(a, 7);
}
+// CHECK-LABEL: define i16 @test_vqshlh_n_s16(i16 %a) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK: [[VQSHLH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 15, i16 15, i16 15>)
+// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_S16]], i64 0
+// CHECK: ret i16 [[TMP1]]
int16_t test_vqshlh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqshlh_n_s16
-// CHECK: sqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, #15
  return (int16_t)vqshlh_n_s16(a, 15);
}
+// CHECK-LABEL: define i32 @test_vqshls_n_s32(i32 %a) #0 {
+// CHECK: [[VQSHLS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 31)
+// CHECK: ret i32 [[VQSHLS_N_S32]]
int32_t test_vqshls_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqshls_n_s32
-// CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
  return (int32_t)vqshls_n_s32(a, 31);
}
+// CHECK-LABEL: define i64 @test_vqshld_n_s64(i64 %a) #0 {
+// CHECK: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 63)
+// CHECK: ret i64 [[VQSHL_N]]
int64_t test_vqshld_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqshld_n_s64
-// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
  return (int64_t)vqshld_n_s64(a, 63);
}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) #0 {
+// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
+// CHECK: ret <8 x i8> [[VQSHL_N]]
int8x8_t test_vqshl_n_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqshl_n_s8
  return vqshl_n_s8(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) #0 {
+// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
+// CHECK: ret <16 x i8> [[VQSHL_N]]
int8x16_t test_vqshlq_n_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s8
  return vqshlq_n_s8(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer)
+// CHECK: ret <4 x i16> [[VQSHL_N1]]
int16x4_t test_vqshl_n_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqshl_n_s16
  return vqshl_n_s16(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer)
+// CHECK: ret <8 x i16> [[VQSHL_N1]]
int16x8_t test_vqshlq_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s16
  return vqshlq_n_s16(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer)
+// CHECK: ret <2 x i32> [[VQSHL_N1]]
int32x2_t test_vqshl_n_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqshl_n_s32
  return vqshl_n_s32(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
+// CHECK: ret <4 x i32> [[VQSHL_N1]]
int32x4_t test_vqshlq_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s32
  return vqshlq_n_s32(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
+// CHECK: ret <2 x i64> [[VQSHL_N1]]
int64x2_t test_vqshlq_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s64
  return vqshlq_n_s64(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) #0 {
+// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
+// CHECK: ret <8 x i8> [[VQSHL_N]]
uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vqshl_n_u8
  return vqshl_n_u8(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) #0 {
+// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
+// CHECK: ret <16 x i8> [[VQSHL_N]]
uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u8
  return vqshlq_n_u8(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer)
+// CHECK: ret <4 x i16> [[VQSHL_N1]]
uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vqshl_n_u16
  return vqshl_n_u16(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer)
+// CHECK: ret <8 x i16> [[VQSHL_N1]]
uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u16
  return vqshlq_n_u16(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer)
+// CHECK: ret <2 x i32> [[VQSHL_N1]]
uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vqshl_n_u32
  return vqshl_n_u32(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
+// CHECK: ret <4 x i32> [[VQSHL_N1]]
uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u32
  return vqshlq_n_u32(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
+// CHECK: ret <2 x i64> [[VQSHL_N1]]
uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u64
  return vqshlq_n_u64(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
+// CHECK: ret <1 x i64> [[VQSHL_N1]]
int64x1_t test_vqshl_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vqshl_n_s64
-// CHECK: sqshl d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vqshl_n_s64(a, 1);
}
+// CHECK-LABEL: define i8 @test_vqshlb_n_u8(i8 %a) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK: [[VQSHLB_N_U8:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>)
+// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_U8]], i64 0
+// CHECK: ret i8 [[TMP1]]
uint8_t test_vqshlb_n_u8(uint8_t a) {
-// CHECK-LABEL: test_vqshlb_n_u8
-// CHECK: uqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, #7
  return (uint8_t)vqshlb_n_u8(a, 7);
}
+// CHECK-LABEL: define i16 @test_vqshlh_n_u16(i16 %a) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK: [[VQSHLH_N_U16:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 15, i16 15, i16 15>)
+// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_U16]], i64 0
+// CHECK: ret i16 [[TMP1]]
uint16_t test_vqshlh_n_u16(uint16_t a) {
-// CHECK-LABEL: test_vqshlh_n_u16
-// CHECK: uqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, #15
  return (uint16_t)vqshlh_n_u16(a, 15);
}
+// CHECK-LABEL: define i32 @test_vqshls_n_u32(i32 %a) #0 {
+// CHECK: [[VQSHLS_N_U32:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 31)
+// CHECK: ret i32 [[VQSHLS_N_U32]]
uint32_t test_vqshls_n_u32(uint32_t a) {
-// CHECK-LABEL: test_vqshls_n_u32
-// CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
  return (uint32_t)vqshls_n_u32(a, 31);
}
+// CHECK-LABEL: define i64 @test_vqshld_n_u64(i64 %a) #0 {
+// CHECK: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 63)
+// CHECK: ret i64 [[VQSHL_N]]
uint64_t test_vqshld_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vqshld_n_u64
-// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
  return (uint64_t)vqshld_n_u64(a, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
+// CHECK: ret <1 x i64> [[VQSHL_N1]]
uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vqshl_n_u64
-// CHECK: uqshl d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vqshl_n_u64(a, 1);
}
+// CHECK-LABEL: define i8 @test_vqshlub_n_s8(i8 %a) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK: [[VQSHLUB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>)
+// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLUB_N_S8]], i64 0
+// CHECK: ret i8 [[TMP1]]
int8_t test_vqshlub_n_s8(int8_t a) {
-// CHECK-LABEL: test_vqshlub_n_s8
-// CHECK: sqshlu {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, #7
  return (int8_t)vqshlub_n_s8(a, 7);
}
+// CHECK-LABEL: define i16 @test_vqshluh_n_s16(i16 %a) #0 {
+// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK: [[VQSHLUH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 15, i16 15, i16 15>)
+// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLUH_N_S16]], i64 0
+// CHECK: ret i16 [[TMP1]]
int16_t test_vqshluh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqshluh_n_s16
-// CHECK: sqshlu {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, #15
  return (int16_t)vqshluh_n_s16(a, 15);
}
+// CHECK-LABEL: define i32 @test_vqshlus_n_s32(i32 %a) #0 {
+// CHECK: [[VQSHLUS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 %a, i32 31)
+// CHECK: ret i32 [[VQSHLUS_N_S32]]
int32_t test_vqshlus_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqshlus_n_s32
-// CHECK: sqshlu {{s[0-9]+}}, {{s[0-9]+}}, #31
  return (int32_t)vqshlus_n_s32(a, 31);
}
+// CHECK-LABEL: define i64 @test_vqshlud_n_s64(i64 %a) #0 {
+// CHECK: [[VQSHLU_N:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 %a, i64 63)
+// CHECK: ret i64 [[VQSHLU_N]]
int64_t test_vqshlud_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqshlud_n_s64
-// CHECK: sqshlu {{d[0-9]+}}, {{d[0-9]+}}, #63
  return (int64_t)vqshlud_n_s64(a, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64(<1 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>)
+// CHECK: ret <1 x i64> [[VQSHLU_N1]]
uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vqshlu_n_s64
-// CHECK: sqshlu d{{[0-9]+}}, d{{[0-9]+}}, #1
  return vqshlu_n_s64(a, 1);
}
+// CHECK-LABEL: define i64 @test_vsrid_n_s64(i64 %a, i64 %b) #0 {
+// CHECK: [[VSRID_N_S64:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK: [[VSRID_N_S641:%.*]] = bitcast i64 %b to <1 x i64>
+// CHECK: [[VSRID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_S64]], <1 x i64> [[VSRID_N_S641]], i32 63)
+// CHECK: [[VSRID_N_S643:%.*]] = bitcast <1 x i64> [[VSRID_N_S642]] to i64
+// CHECK: ret i64 [[VSRID_N_S643]]
int64_t test_vsrid_n_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vsrid_n_s64
-// CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63
  return (int64_t)vsrid_n_s64(a, b, 63);
}
+// CHECK-LABEL: define <1 x i64> @test_vsri_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64>
%b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N]]1, i32 1) +// CHECK: ret <1 x i64> [[VSRI_N]]2 int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) { -// CHECK-LABEL: test_vsri_n_s64 -// CHECK: sri d{{[0-9]+}}, d{{[0-9]+}}, #1 return vsri_n_s64(a, b, 1); } +// CHECK-LABEL: define i64 @test_vsrid_n_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VSRID_N_U64:%.*]] = bitcast i64 %a to <1 x i64> +// CHECK: [[VSRID_N_U641:%.*]] = bitcast i64 %b to <1 x i64> +// CHECK: [[VSRID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_U64]], <1 x i64> [[VSRID_N_U64]]1, i32 63) +// CHECK: [[VSRID_N_U643:%.*]] = bitcast <1 x i64> [[VSRID_N_U64]]2 to i64 +// CHECK: ret i64 [[VSRID_N_U64]]3 uint64_t test_vsrid_n_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: test_vsrid_n_u64 -// CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63 return (uint64_t)vsrid_n_u64(a, b, 63); } +// CHECK-LABEL: define <1 x i64> @test_vsri_n_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N]]1, i32 1) +// CHECK: ret <1 x i64> [[VSRI_N]]2 uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) { -// CHECK-LABEL: test_vsri_n_u64 -// CHECK: sri d{{[0-9]+}}, d{{[0-9]+}}, #1 return vsri_n_u64(a, b, 1); } +// CHECK-LABEL: define i64 @test_vslid_n_s64(i64 %a, i64 %b) #0 { +// CHECK: [[VSLID_N_S64:%.*]] = bitcast i64 %a to <1 x i64> +// CHECK: [[VSLID_N_S641:%.*]] = bitcast i64 %b to <1 x i64> +// CHECK: [[VSLID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_S64]], <1 x i64> [[VSLID_N_S64]]1, i32 63) +// CHECK: [[VSLID_N_S643:%.*]] = bitcast <1 x i64> [[VSLID_N_S64]]2 to i64 +// CHECK: ret i64 [[VSLID_N_S64]]3 int64_t test_vslid_n_s64(int64_t a, int64_t b) { -// CHECK-LABEL: test_vslid_n_s64 -// CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63 return (int64_t)vslid_n_s64(a, b, 63); } +// CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N]]1, i32 1) +// CHECK: ret <1 x i64> [[VSLI_N]]2 int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) { -// CHECK-LABEL: test_vsli_n_s64 -// CHECK: sli d{{[0-9]+}}, d{{[0-9]+}}, #1 return vsli_n_s64(a, b, 1); } +// CHECK-LABEL: define i64 @test_vslid_n_u64(i64 %a, i64 %b) #0 { +// CHECK: [[VSLID_N_U64:%.*]] = bitcast i64 %a to <1 x i64> +// CHECK: [[VSLID_N_U641:%.*]] = bitcast i64 %b to <1 x i64> +// CHECK: [[VSLID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_U64]], <1 x i64> [[VSLID_N_U64]]1, i32 63) +// CHECK: [[VSLID_N_U643:%.*]] = bitcast <1 x i64> [[VSLID_N_U64]]2 to i64 +// CHECK: ret i64 [[VSLID_N_U64]]3 uint64_t test_vslid_n_u64(uint64_t a, uint64_t b) { -// CHECK-LABEL: 
test_vslid_n_u64 -// CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63 return (uint64_t)vslid_n_u64(a, b, 63); } +// CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N]]1, i32 1) +// CHECK: ret <1 x i64> [[VSLI_N]]2 uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) { -// CHECK-LABEL: test_vsli_n_u64 -// CHECK: sli d{{[0-9]+}}, d{{[0-9]+}}, #1 return vsli_n_u64(a, b, 1); } +// CHECK-LABEL: define i8 @test_vqshrnh_n_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP0]], i32 8) +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_S16]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqshrnh_n_s16(int16_t a) { -// CHECK-LABEL: test_vqshrnh_n_s16 -// CHECK: sqshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8 return (int8_t)vqshrnh_n_s16(a, 8); } +// CHECK-LABEL: define i16 @test_vqshrns_n_s32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP0]], i32 16) +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_S32]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqshrns_n_s32(int32_t a) { -// CHECK-LABEL: test_vqshrns_n_s32 -// CHECK: sqshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16 return (int16_t)vqshrns_n_s32(a, 16); } +// CHECK-LABEL: define i32 @test_vqshrnd_n_s64(i64 %a) #0 { +// CHECK: [[VQSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %a, i32 32) +// CHECK: ret i32 [[VQSHRND_N_S64]] int32_t test_vqshrnd_n_s64(int64_t a) { -// CHECK-LABEL: test_vqshrnd_n_s64 -// CHECK: sqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 return (int32_t)vqshrnd_n_s64(a, 32); } +// CHECK-LABEL: define i8 @test_vqshrnh_n_u16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP0]], i32 8) +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_U16]], i64 0 +// CHECK: ret i8 [[TMP1]] uint8_t test_vqshrnh_n_u16(uint16_t a) { -// CHECK-LABEL: test_vqshrnh_n_u16 -// CHECK: uqshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8 return (uint8_t)vqshrnh_n_u16(a, 8); } +// CHECK-LABEL: define i16 @test_vqshrns_n_u32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP0]], i32 16) +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_U32]], i64 0 +// CHECK: ret i16 [[TMP1]] uint16_t test_vqshrns_n_u32(uint32_t a) { -// CHECK-LABEL: test_vqshrns_n_u32 -// CHECK: uqshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16 return (uint16_t)vqshrns_n_u32(a, 16); } +// CHECK-LABEL: define i32 @test_vqshrnd_n_u64(i64 %a) #0 { +// CHECK: [[VQSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %a, i32 32) +// CHECK: ret i32 [[VQSHRND_N_U64]] uint32_t test_vqshrnd_n_u64(uint64_t a) { -// CHECK-LABEL: test_vqshrnd_n_u64 -// CHECK: uqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 return 
(uint32_t)vqshrnd_n_u64(a, 32); } +// CHECK-LABEL: define i8 @test_vqrshrnh_n_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQRSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8) +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_S16]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqrshrnh_n_s16(int16_t a) { -// CHECK-LABEL: test_vqrshrnh_n_s16 -// CHECK: sqrshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8 return (int8_t)vqrshrnh_n_s16(a, 8); } +// CHECK-LABEL: define i16 @test_vqrshrns_n_s32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQRSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16) +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_S32]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqrshrns_n_s32(int32_t a) { -// CHECK-LABEL: test_vqrshrns_n_s32 -// CHECK: sqrshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16 return (int16_t)vqrshrns_n_s32(a, 16); } +// CHECK-LABEL: define i32 @test_vqrshrnd_n_s64(i64 %a) #0 { +// CHECK: [[VQRSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %a, i32 32) +// CHECK: ret i32 [[VQRSHRND_N_S64]] int32_t test_vqrshrnd_n_s64(int64_t a) { -// CHECK-LABEL: test_vqrshrnd_n_s64 -// CHECK: sqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 return (int32_t)vqrshrnd_n_s64(a, 32); } +// CHECK-LABEL: define i8 @test_vqrshrnh_n_u16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQRSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8) +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_U16]], i64 0 +// CHECK: ret i8 [[TMP1]] uint8_t test_vqrshrnh_n_u16(uint16_t a) { -// CHECK-LABEL: test_vqrshrnh_n_u16 -// CHECK: uqrshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8 return (uint8_t)vqrshrnh_n_u16(a, 8); } +// CHECK-LABEL: define i16 @test_vqrshrns_n_u32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQRSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16) +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_U32]], i64 0 +// CHECK: ret i16 [[TMP1]] uint16_t test_vqrshrns_n_u32(uint32_t a) { -// CHECK-LABEL: test_vqrshrns_n_u32 -// CHECK: uqrshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16 return (uint16_t)vqrshrns_n_u32(a, 16); } +// CHECK-LABEL: define i32 @test_vqrshrnd_n_u64(i64 %a) #0 { +// CHECK: [[VQRSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %a, i32 32) +// CHECK: ret i32 [[VQRSHRND_N_U64]] uint32_t test_vqrshrnd_n_u64(uint64_t a) { -// CHECK-LABEL: test_vqrshrnd_n_u64 -// CHECK: uqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 return (uint32_t)vqrshrnd_n_u64(a, 32); } +// CHECK-LABEL: define i8 @test_vqshrunh_n_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP0]], i32 8) +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRUNH_N_S16]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqshrunh_n_s16(int16_t a) { -// CHECK-LABEL: test_vqshrunh_n_s16 -// CHECK: sqshrun {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8 return (int8_t)vqshrunh_n_s16(a, 8); } +// CHECK-LABEL: define i16 @test_vqshruns_n_s32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, 
i32 %a, i64 0 +// CHECK: [[VQSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP0]], i32 16) +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRUNS_N_S32]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqshruns_n_s32(int32_t a) { -// CHECK-LABEL: test_vqshruns_n_s32 -// CHECK: sqshrun {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16 return (int16_t)vqshruns_n_s32(a, 16); } +// CHECK-LABEL: define i32 @test_vqshrund_n_s64(i64 %a) #0 { +// CHECK: [[VQSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %a, i32 32) +// CHECK: ret i32 [[VQSHRUND_N_S64]] int32_t test_vqshrund_n_s64(int64_t a) { -// CHECK-LABEL: test_vqshrund_n_s64 -// CHECK: sqshrun {{s[0-9]+}}, {{d[0-9]+}}, #32 return (int32_t)vqshrund_n_s64(a, 32); } +// CHECK-LABEL: define i8 @test_vqrshrunh_n_s16(i16 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0 +// CHECK: [[VQRSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP0]], i32 8) +// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRUNH_N_S16]], i64 0 +// CHECK: ret i8 [[TMP1]] int8_t test_vqrshrunh_n_s16(int16_t a) { -// CHECK-LABEL: test_vqrshrunh_n_s16 -// CHECK: sqrshrun {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8 return (int8_t)vqrshrunh_n_s16(a, 8); } +// CHECK-LABEL: define i16 @test_vqrshruns_n_s32(i32 %a) #0 { +// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0 +// CHECK: [[VQRSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP0]], i32 16) +// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRUNS_N_S32]], i64 0 +// CHECK: ret i16 [[TMP1]] int16_t test_vqrshruns_n_s32(int32_t a) { -// CHECK-LABEL: test_vqrshruns_n_s32 -// CHECK: sqrshrun {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16 return (int16_t)vqrshruns_n_s32(a, 16); } +// CHECK-LABEL: define i32 @test_vqrshrund_n_s64(i64 %a) #0 { +// CHECK: [[VQRSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %a, i32 32) +// CHECK: ret i32 [[VQRSHRUND_N_S64]] int32_t test_vqrshrund_n_s64(int64_t a) { -// CHECK-LABEL: test_vqrshrund_n_s64 -// CHECK: sqrshrun {{s[0-9]+}}, {{d[0-9]+}}, #32 return (int32_t)vqrshrund_n_s64(a, 32); } +// CHECK-LABEL: define float @test_vcvts_n_f32_s32(i32 %a) #0 { +// CHECK: [[VCVTS_N_F32_S32:%.*]] = call float @llvm.aarch64.neon.vcvtfxs2fp.f32.i32(i32 %a, i32 1) +// CHECK: ret float [[VCVTS_N_F32_S32]] float32_t test_vcvts_n_f32_s32(int32_t a) { -// CHECK-LABEL: test_vcvts_n_f32_s32 -// CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}, #1 return vcvts_n_f32_s32(a, 1); } +// CHECK-LABEL: define double @test_vcvtd_n_f64_s64(i64 %a) #0 { +// CHECK: [[VCVTD_N_F64_S64:%.*]] = call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 %a, i32 1) +// CHECK: ret double [[VCVTD_N_F64_S64]] float64_t test_vcvtd_n_f64_s64(int64_t a) { -// CHECK-LABEL: test_vcvtd_n_f64_s64 -// CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}, #1 return vcvtd_n_f64_s64(a, 1); } +// CHECK-LABEL: define float @test_vcvts_n_f32_u32(i32 %a) #0 { +// CHECK: [[VCVTS_N_F32_U32:%.*]] = call float @llvm.aarch64.neon.vcvtfxu2fp.f32.i32(i32 %a, i32 32) +// CHECK: ret float [[VCVTS_N_F32_U32]] float32_t test_vcvts_n_f32_u32(uint32_t a) { -// CHECK-LABEL: test_vcvts_n_f32_u32 -// CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}, #32 return vcvts_n_f32_u32(a, 32); } +// CHECK-LABEL: define double @test_vcvtd_n_f64_u64(i64 %a) #0 { +// CHECK: [[VCVTD_N_F64_U64:%.*]] = call double @llvm.aarch64.neon.vcvtfxu2fp.f64.i64(i64 %a, i32 64) +// CHECK: ret double [[VCVTD_N_F64_U64]] 
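+// A minimal C sketch of the fixed-point converts checked above (hypothetical
+// helper for illustration only, assuming <math.h>'s ldexp): scvtf/ucvtf with
+// n fractional bits divides the converted value by 2^n, i.e.
+//   double ucvtf_n_f64_u64(uint64_t a, int n) { return ldexp((double)a, -n); }
+// so vcvtd_n_f64_u64(a, 64) yields (double)a * 2^-64.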
float64_t test_vcvtd_n_f64_u64(uint64_t a) { -// CHECK-LABEL: test_vcvtd_n_f64_u64 -// CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}, #64 return vcvtd_n_f64_u64(a, 64); } +// CHECK-LABEL: define i32 @test_vcvts_n_s32_f32(float %a) #0 { +// CHECK: [[VCVTS_N_S32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f32(float %a, i32 1) +// CHECK: ret i32 [[VCVTS_N_S32_F32]] int32_t test_vcvts_n_s32_f32(float32_t a) { -// CHECK-LABEL: test_vcvts_n_s32_f32 -// CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}, #1 return (int32_t)vcvts_n_s32_f32(a, 1); } +// CHECK-LABEL: define i64 @test_vcvtd_n_s64_f64(double %a) #0 { +// CHECK: [[VCVTD_N_S64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f64(double %a, i32 1) +// CHECK: ret i64 [[VCVTD_N_S64_F64]] int64_t test_vcvtd_n_s64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtd_n_s64_f64 -// CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}, #1 return (int64_t)vcvtd_n_s64_f64(a, 1); } +// CHECK-LABEL: define i32 @test_vcvts_n_u32_f32(float %a) #0 { +// CHECK: [[VCVTS_N_U32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f32(float %a, i32 32) +// CHECK: ret i32 [[VCVTS_N_U32_F32]] uint32_t test_vcvts_n_u32_f32(float32_t a) { -// CHECK-LABEL: test_vcvts_n_u32_f32 -// CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}, #32 return (uint32_t)vcvts_n_u32_f32(a, 32); } +// CHECK-LABEL: define i64 @test_vcvtd_n_u64_f64(double %a) #0 { +// CHECK: [[VCVTD_N_U64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f64(double %a, i32 64) +// CHECK: ret i64 [[VCVTD_N_U64_F64]] uint64_t test_vcvtd_n_u64_f64(float64_t a) { -// CHECK-LABEL: test_vcvtd_n_u64_f64 -// CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}, #64 return (uint64_t)vcvtd_n_u64_f64(a, 64); } -// CHECK-LABEL: test_vreinterpret_s8_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_s16(int16x4_t a) { return vreinterpret_s8_s16(a); } -// CHECK-LABEL: test_vreinterpret_s8_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_s32(int32x2_t a) { return vreinterpret_s8_s32(a); } -// CHECK-LABEL: test_vreinterpret_s8_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_s64(int64x1_t a) { return vreinterpret_s8_s64(a); } -// CHECK-LABEL: test_vreinterpret_s8_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) { return vreinterpret_s8_u8(a); } -// CHECK-LABEL: test_vreinterpret_s8_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) { return vreinterpret_s8_u16(a); } -// CHECK-LABEL: test_vreinterpret_s8_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) { return vreinterpret_s8_u32(a); } -// CHECK-LABEL: test_vreinterpret_s8_u64: -// 
CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) { return vreinterpret_s8_u64(a); } -// CHECK-LABEL: test_vreinterpret_s8_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_f16(float16x4_t a) { return vreinterpret_s8_f16(a); } -// CHECK-LABEL: test_vreinterpret_s8_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_f32(float32x2_t a) { return vreinterpret_s8_f32(a); } -// CHECK-LABEL: test_vreinterpret_s8_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_f64(float64x1_t a) { return vreinterpret_s8_f64(a); } -// CHECK-LABEL: test_vreinterpret_s8_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) { return vreinterpret_s8_p8(a); } -// CHECK-LABEL: test_vreinterpret_s8_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) { return vreinterpret_s8_p16(a); } -// CHECK-LABEL: test_vreinterpret_s8_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_p64(poly64x1_t a) { return vreinterpret_s8_p64(a); } -// CHECK-LABEL: test_vreinterpret_s16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s8(int8x8_t a) { return vreinterpret_s16_s8(a); } -// CHECK-LABEL: test_vreinterpret_s16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s32(int32x2_t a) { return vreinterpret_s16_s32(a); } -// CHECK-LABEL: test_vreinterpret_s16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s64(int64x1_t a) { return vreinterpret_s16_s64(a); } -// CHECK-LABEL: test_vreinterpret_s16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) { return vreinterpret_s16_u8(a); } -// CHECK-LABEL: test_vreinterpret_s16_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(<4 x i16> %a) #0 { +// CHECK: ret <4 x 
i16> %a int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) { return vreinterpret_s16_u16(a); } -// CHECK-LABEL: test_vreinterpret_s16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) { return vreinterpret_s16_u32(a); } -// CHECK-LABEL: test_vreinterpret_s16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) { return vreinterpret_s16_u64(a); } -// CHECK-LABEL: test_vreinterpret_s16_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_f16(float16x4_t a) { return vreinterpret_s16_f16(a); } -// CHECK-LABEL: test_vreinterpret_s16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_f32(float32x2_t a) { return vreinterpret_s16_f32(a); } -// CHECK-LABEL: test_vreinterpret_s16_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_f64(float64x1_t a) { return vreinterpret_s16_f64(a); } -// CHECK-LABEL: test_vreinterpret_s16_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) { return vreinterpret_s16_p8(a); } -// CHECK-LABEL: test_vreinterpret_s16_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) { return vreinterpret_s16_p16(a); } -// CHECK-LABEL: test_vreinterpret_s16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_p64(poly64x1_t a) { return vreinterpret_s16_p64(a); } -// CHECK-LABEL: test_vreinterpret_s32_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s8(int8x8_t a) { return vreinterpret_s32_s8(a); } -// CHECK-LABEL: test_vreinterpret_s32_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s16(int16x4_t a) { return vreinterpret_s32_s16(a); } -// CHECK-LABEL: test_vreinterpret_s32_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s64(int64x1_t a) 
{ return vreinterpret_s32_s64(a); } -// CHECK-LABEL: test_vreinterpret_s32_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) { return vreinterpret_s32_u8(a); } -// CHECK-LABEL: test_vreinterpret_s32_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) { return vreinterpret_s32_u16(a); } -// CHECK-LABEL: test_vreinterpret_s32_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u32(<2 x i32> %a) #0 { +// CHECK: ret <2 x i32> %a int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) { return vreinterpret_s32_u32(a); } -// CHECK-LABEL: test_vreinterpret_s32_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) { return vreinterpret_s32_u64(a); } -// CHECK-LABEL: test_vreinterpret_s32_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_f16(float16x4_t a) { return vreinterpret_s32_f16(a); } -// CHECK-LABEL: test_vreinterpret_s32_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_f32(float32x2_t a) { return vreinterpret_s32_f32(a); } -// CHECK-LABEL: test_vreinterpret_s32_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_f64(float64x1_t a) { return vreinterpret_s32_f64(a); } -// CHECK-LABEL: test_vreinterpret_s32_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) { return vreinterpret_s32_p8(a); } -// CHECK-LABEL: test_vreinterpret_s32_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) { return vreinterpret_s32_p16(a); } -// CHECK-LABEL: test_vreinterpret_s32_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_p64(poly64x1_t a) { return vreinterpret_s32_p64(a); } -// CHECK-LABEL: test_vreinterpret_s64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s8(int8x8_t a) { return vreinterpret_s64_s8(a); } -// CHECK-LABEL: 
test_vreinterpret_s64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s16(int16x4_t a) { return vreinterpret_s64_s16(a); } -// CHECK-LABEL: test_vreinterpret_s64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s32(int32x2_t a) { return vreinterpret_s64_s32(a); } -// CHECK-LABEL: test_vreinterpret_s64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) { return vreinterpret_s64_u8(a); } -// CHECK-LABEL: test_vreinterpret_s64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) { return vreinterpret_s64_u16(a); } -// CHECK-LABEL: test_vreinterpret_s64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) { return vreinterpret_s64_u32(a); } -// CHECK-LABEL: test_vreinterpret_s64_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) { return vreinterpret_s64_u64(a); } -// CHECK-LABEL: test_vreinterpret_s64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_f16(float16x4_t a) { return vreinterpret_s64_f16(a); } -// CHECK-LABEL: test_vreinterpret_s64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_f32(float32x2_t a) { return vreinterpret_s64_f32(a); } -// CHECK-LABEL: test_vreinterpret_s64_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_f64(float64x1_t a) { return vreinterpret_s64_f64(a); } -// CHECK-LABEL: test_vreinterpret_s64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) { return vreinterpret_s64_p8(a); } -// CHECK-LABEL: test_vreinterpret_s64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) { return vreinterpret_s64_p16(a); } -// CHECK-LABEL: test_vreinterpret_s64_p64: -// CHECK-NEXT: ret +// 
CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a int64x1_t test_vreinterpret_s64_p64(poly64x1_t a) { return vreinterpret_s64_p64(a); } -// CHECK-LABEL: test_vreinterpret_u8_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) { return vreinterpret_u8_s8(a); } -// CHECK-LABEL: test_vreinterpret_u8_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) { return vreinterpret_u8_s16(a); } -// CHECK-LABEL: test_vreinterpret_u8_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) { return vreinterpret_u8_s32(a); } -// CHECK-LABEL: test_vreinterpret_u8_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) { return vreinterpret_u8_s64(a); } -// CHECK-LABEL: test_vreinterpret_u8_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) { return vreinterpret_u8_u16(a); } -// CHECK-LABEL: test_vreinterpret_u8_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) { return vreinterpret_u8_u32(a); } -// CHECK-LABEL: test_vreinterpret_u8_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) { return vreinterpret_u8_u64(a); } -// CHECK-LABEL: test_vreinterpret_u8_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) { return vreinterpret_u8_f16(a); } -// CHECK-LABEL: test_vreinterpret_u8_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) { return vreinterpret_u8_f32(a); } -// CHECK-LABEL: test_vreinterpret_u8_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_f64(float64x1_t a) { return vreinterpret_u8_f64(a); } -// CHECK-LABEL: test_vreinterpret_u8_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) { return vreinterpret_u8_p8(a); } -// 
CHECK-LABEL: test_vreinterpret_u8_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) { return vreinterpret_u8_p16(a); } -// CHECK-LABEL: test_vreinterpret_u8_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_p64(poly64x1_t a) { return vreinterpret_u8_p64(a); } -// CHECK-LABEL: test_vreinterpret_u16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) { return vreinterpret_u16_s8(a); } -// CHECK-LABEL: test_vreinterpret_u16_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) { return vreinterpret_u16_s16(a); } -// CHECK-LABEL: test_vreinterpret_u16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) { return vreinterpret_u16_s32(a); } -// CHECK-LABEL: test_vreinterpret_u16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) { return vreinterpret_u16_s64(a); } -// CHECK-LABEL: test_vreinterpret_u16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) { return vreinterpret_u16_u8(a); } -// CHECK-LABEL: test_vreinterpret_u16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) { return vreinterpret_u16_u32(a); } -// CHECK-LABEL: test_vreinterpret_u16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) { return vreinterpret_u16_u64(a); } -// CHECK-LABEL: test_vreinterpret_u16_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) { return vreinterpret_u16_f16(a); } -// CHECK-LABEL: test_vreinterpret_u16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) { return vreinterpret_u16_f32(a); } -// CHECK-LABEL: test_vreinterpret_u16_f64: -// CHECK-NEXT: ret +// 
CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_f64(float64x1_t a) { return vreinterpret_u16_f64(a); } -// CHECK-LABEL: test_vreinterpret_u16_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) { return vreinterpret_u16_p8(a); } -// CHECK-LABEL: test_vreinterpret_u16_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) { return vreinterpret_u16_p16(a); } -// CHECK-LABEL: test_vreinterpret_u16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_p64(poly64x1_t a) { return vreinterpret_u16_p64(a); } -// CHECK-LABEL: test_vreinterpret_u32_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) { return vreinterpret_u32_s8(a); } -// CHECK-LABEL: test_vreinterpret_u32_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) { return vreinterpret_u32_s16(a); } -// CHECK-LABEL: test_vreinterpret_u32_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(<2 x i32> %a) #0 { +// CHECK: ret <2 x i32> %a uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) { return vreinterpret_u32_s32(a); } -// CHECK-LABEL: test_vreinterpret_u32_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) { return vreinterpret_u32_s64(a); } -// CHECK-LABEL: test_vreinterpret_u32_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) { return vreinterpret_u32_u8(a); } -// CHECK-LABEL: test_vreinterpret_u32_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) { return vreinterpret_u32_u16(a); } -// CHECK-LABEL: test_vreinterpret_u32_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) { return vreinterpret_u32_u64(a); } -// CHECK-LABEL: test_vreinterpret_u32_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to 
<2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) { return vreinterpret_u32_f16(a); } -// CHECK-LABEL: test_vreinterpret_u32_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) { return vreinterpret_u32_f32(a); } -// CHECK-LABEL: test_vreinterpret_u32_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_f64(float64x1_t a) { return vreinterpret_u32_f64(a); } -// CHECK-LABEL: test_vreinterpret_u32_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) { return vreinterpret_u32_p8(a); } -// CHECK-LABEL: test_vreinterpret_u32_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) { return vreinterpret_u32_p16(a); } -// CHECK-LABEL: test_vreinterpret_u32_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_p64(poly64x1_t a) { return vreinterpret_u32_p64(a); } -// CHECK-LABEL: test_vreinterpret_u64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) { return vreinterpret_u64_s8(a); } -// CHECK-LABEL: test_vreinterpret_u64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) { return vreinterpret_u64_s16(a); } -// CHECK-LABEL: test_vreinterpret_u64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) { return vreinterpret_u64_s32(a); } -// CHECK-LABEL: test_vreinterpret_u64_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) { return vreinterpret_u64_s64(a); } -// CHECK-LABEL: test_vreinterpret_u64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) { return vreinterpret_u64_u8(a); } -// CHECK-LABEL: test_vreinterpret_u64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t 
test_vreinterpret_u64_u16(uint16x4_t a) { return vreinterpret_u64_u16(a); } -// CHECK-LABEL: test_vreinterpret_u64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) { return vreinterpret_u64_u32(a); } -// CHECK-LABEL: test_vreinterpret_u64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) { return vreinterpret_u64_f16(a); } -// CHECK-LABEL: test_vreinterpret_u64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) { return vreinterpret_u64_f32(a); } -// CHECK-LABEL: test_vreinterpret_u64_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_f64(float64x1_t a) { return vreinterpret_u64_f64(a); } -// CHECK-LABEL: test_vreinterpret_u64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) { return vreinterpret_u64_p8(a); } -// CHECK-LABEL: test_vreinterpret_u64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) { return vreinterpret_u64_p16(a); } -// CHECK-LABEL: test_vreinterpret_u64_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a uint64x1_t test_vreinterpret_u64_p64(poly64x1_t a) { return vreinterpret_u64_p64(a); } -// CHECK-LABEL: test_vreinterpret_f16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s8(int8x8_t a) { return vreinterpret_f16_s8(a); } -// CHECK-LABEL: test_vreinterpret_f16_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s16(int16x4_t a) { return vreinterpret_f16_s16(a); } -// CHECK-LABEL: test_vreinterpret_f16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s32(int32x2_t a) { return vreinterpret_f16_s32(a); } -// CHECK-LABEL: test_vreinterpret_f16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t 
test_vreinterpret_f16_s64(int64x1_t a) { return vreinterpret_f16_s64(a); } -// CHECK-LABEL: test_vreinterpret_f16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) { return vreinterpret_f16_u8(a); } -// CHECK-LABEL: test_vreinterpret_f16_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) { return vreinterpret_f16_u16(a); } -// CHECK-LABEL: test_vreinterpret_f16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) { return vreinterpret_f16_u32(a); } -// CHECK-LABEL: test_vreinterpret_f16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) { return vreinterpret_f16_u64(a); } -// CHECK-LABEL: test_vreinterpret_f16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_f32(float32x2_t a) { return vreinterpret_f16_f32(a); } -// CHECK-LABEL: test_vreinterpret_f16_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_f64(float64x1_t a) { return vreinterpret_f16_f64(a); } -// CHECK-LABEL: test_vreinterpret_f16_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) { return vreinterpret_f16_p8(a); } -// CHECK-LABEL: test_vreinterpret_f16_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) { return vreinterpret_f16_p16(a); } -// CHECK-LABEL: test_vreinterpret_f16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_p64(poly64x1_t a) { return vreinterpret_f16_p64(a); } -// CHECK-LABEL: test_vreinterpret_f32_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s8(int8x8_t a) { return vreinterpret_f32_s8(a); } -// CHECK-LABEL: test_vreinterpret_f32_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to 
<2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s16(int16x4_t a) { return vreinterpret_f32_s16(a); } -// CHECK-LABEL: test_vreinterpret_f32_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s32(int32x2_t a) { return vreinterpret_f32_s32(a); } -// CHECK-LABEL: test_vreinterpret_f32_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s64(int64x1_t a) { return vreinterpret_f32_s64(a); } -// CHECK-LABEL: test_vreinterpret_f32_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) { return vreinterpret_f32_u8(a); } -// CHECK-LABEL: test_vreinterpret_f32_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) { return vreinterpret_f32_u16(a); } -// CHECK-LABEL: test_vreinterpret_f32_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) { return vreinterpret_f32_u32(a); } -// CHECK-LABEL: test_vreinterpret_f32_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) { return vreinterpret_f32_u64(a); } -// CHECK-LABEL: test_vreinterpret_f32_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_f16(float16x4_t a) { return vreinterpret_f32_f16(a); } -// CHECK-LABEL: test_vreinterpret_f32_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_f64(float64x1_t a) { return vreinterpret_f32_f64(a); } -// CHECK-LABEL: test_vreinterpret_f32_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) { return vreinterpret_f32_p8(a); } -// CHECK-LABEL: test_vreinterpret_f32_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) { return vreinterpret_f32_p16(a); } -// CHECK-LABEL: test_vreinterpret_f32_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x float> 
@test_vreinterpret_f32_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_p64(poly64x1_t a) { return vreinterpret_f32_p64(a); } -// CHECK-LABEL: test_vreinterpret_f64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_s8(int8x8_t a) { return vreinterpret_f64_s8(a); } -// CHECK-LABEL: test_vreinterpret_f64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_s16(int16x4_t a) { return vreinterpret_f64_s16(a); } -// CHECK-LABEL: test_vreinterpret_f64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_s32(int32x2_t a) { return vreinterpret_f64_s32(a); } -// CHECK-LABEL: test_vreinterpret_f64_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_s64(int64x1_t a) { return vreinterpret_f64_s64(a); } -// CHECK-LABEL: test_vreinterpret_f64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_u8(uint8x8_t a) { return vreinterpret_f64_u8(a); } -// CHECK-LABEL: test_vreinterpret_f64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_u16(uint16x4_t a) { return vreinterpret_f64_u16(a); } -// CHECK-LABEL: test_vreinterpret_f64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_u32(uint32x2_t a) { return vreinterpret_f64_u32(a); } -// CHECK-LABEL: test_vreinterpret_f64_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_u64(uint64x1_t a) { return vreinterpret_f64_u64(a); } -// CHECK-LABEL: test_vreinterpret_f64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_f16(float16x4_t a) { return vreinterpret_f64_f16(a); } -// CHECK-LABEL: test_vreinterpret_f64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_f32(float32x2_t a) { return 
vreinterpret_f64_f32(a); } -// CHECK-LABEL: test_vreinterpret_f64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_p8(poly8x8_t a) { return vreinterpret_f64_p8(a); } -// CHECK-LABEL: test_vreinterpret_f64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_p16(poly16x4_t a) { return vreinterpret_f64_p16(a); } -// CHECK-LABEL: test_vreinterpret_f64_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double> +// CHECK: ret <1 x double> [[TMP0]] float64x1_t test_vreinterpret_f64_p64(poly64x1_t a) { return vreinterpret_f64_p64(a); } -// CHECK-LABEL: test_vreinterpret_p8_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) { return vreinterpret_p8_s8(a); } -// CHECK-LABEL: test_vreinterpret_p8_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) { return vreinterpret_p8_s16(a); } -// CHECK-LABEL: test_vreinterpret_p8_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) { return vreinterpret_p8_s32(a); } -// CHECK-LABEL: test_vreinterpret_p8_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) { return vreinterpret_p8_s64(a); } -// CHECK-LABEL: test_vreinterpret_p8_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) { return vreinterpret_p8_u8(a); } -// CHECK-LABEL: test_vreinterpret_p8_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) { return vreinterpret_p8_u16(a); } -// CHECK-LABEL: test_vreinterpret_p8_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) { return vreinterpret_p8_u32(a); } -// CHECK-LABEL: test_vreinterpret_p8_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) { return vreinterpret_p8_u64(a); } -// CHECK-LABEL: test_vreinterpret_p8_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(<4 x half> %a) #0 
{ +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) { return vreinterpret_p8_f16(a); } -// CHECK-LABEL: test_vreinterpret_p8_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) { return vreinterpret_p8_f32(a); } -// CHECK-LABEL: test_vreinterpret_p8_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_f64(float64x1_t a) { return vreinterpret_p8_f64(a); } -// CHECK-LABEL: test_vreinterpret_p8_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) { return vreinterpret_p8_p16(a); } -// CHECK-LABEL: test_vreinterpret_p8_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_p64(poly64x1_t a) { return vreinterpret_p8_p64(a); } -// CHECK-LABEL: test_vreinterpret_p16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) { return vreinterpret_p16_s8(a); } -// CHECK-LABEL: test_vreinterpret_p16_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) { return vreinterpret_p16_s16(a); } -// CHECK-LABEL: test_vreinterpret_p16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) { return vreinterpret_p16_s32(a); } -// CHECK-LABEL: test_vreinterpret_p16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) { return vreinterpret_p16_s64(a); } -// CHECK-LABEL: test_vreinterpret_p16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) { return vreinterpret_p16_u8(a); } -// CHECK-LABEL: test_vreinterpret_p16_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) { return vreinterpret_p16_u16(a); } -// CHECK-LABEL: test_vreinterpret_p16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) { 
return vreinterpret_p16_u32(a); } -// CHECK-LABEL: test_vreinterpret_p16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) { return vreinterpret_p16_u64(a); } -// CHECK-LABEL: test_vreinterpret_p16_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) { return vreinterpret_p16_f16(a); } -// CHECK-LABEL: test_vreinterpret_p16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) { return vreinterpret_p16_f32(a); } -// CHECK-LABEL: test_vreinterpret_p16_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_f64(float64x1_t a) { return vreinterpret_p16_f64(a); } -// CHECK-LABEL: test_vreinterpret_p16_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) { return vreinterpret_p16_p8(a); } -// CHECK-LABEL: test_vreinterpret_p16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_p64(poly64x1_t a) { return vreinterpret_p16_p64(a); } -// CHECK-LABEL: test_vreinterpret_p64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_s8(int8x8_t a) { return vreinterpret_p64_s8(a); } -// CHECK-LABEL: test_vreinterpret_p64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_s16(int16x4_t a) { return vreinterpret_p64_s16(a); } -// CHECK-LABEL: test_vreinterpret_p64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_s32(int32x2_t a) { return vreinterpret_p64_s32(a); } -// CHECK-LABEL: test_vreinterpret_p64_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a poly64x1_t test_vreinterpret_p64_s64(int64x1_t a) { return vreinterpret_p64_s64(a); } -// CHECK-LABEL: test_vreinterpret_p64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_u8(uint8x8_t a) { return vreinterpret_p64_u8(a); } -// CHECK-LABEL: 
test_vreinterpret_p64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_u16(uint16x4_t a) { return vreinterpret_p64_u16(a); } -// CHECK-LABEL: test_vreinterpret_p64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_u32(uint32x2_t a) { return vreinterpret_p64_u32(a); } -// CHECK-LABEL: test_vreinterpret_p64_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a poly64x1_t test_vreinterpret_p64_u64(uint64x1_t a) { return vreinterpret_p64_u64(a); } -// CHECK-LABEL: test_vreinterpret_p64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_f16(float16x4_t a) { return vreinterpret_p64_f16(a); } -// CHECK-LABEL: test_vreinterpret_p64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_f32(float32x2_t a) { return vreinterpret_p64_f32(a); } -// CHECK-LABEL: test_vreinterpret_p64_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_f64(float64x1_t a) { return vreinterpret_p64_f64(a); } -// CHECK-LABEL: test_vreinterpret_p64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_p8(poly8x8_t a) { return vreinterpret_p64_p8(a); } -// CHECK-LABEL: test_vreinterpret_p64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vreinterpret_p64_p16(poly16x4_t a) { return vreinterpret_p64_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) { return vreinterpretq_s8_s16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) { return vreinterpretq_s8_s32(a); } -// CHECK-LABEL: test_vreinterpretq_s8_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) { return vreinterpretq_s8_s64(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u8: -// CHECK-NEXT: 
ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) { return vreinterpretq_s8_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) { return vreinterpretq_s8_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) { return vreinterpretq_s8_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) { return vreinterpretq_s8_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s8_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) { return vreinterpretq_s8_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) { return vreinterpretq_s8_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s8_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_f64(float64x2_t a) { return vreinterpretq_s8_f64(a); } -// CHECK-LABEL: test_vreinterpretq_s8_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) { return vreinterpretq_s8_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s8_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) { return vreinterpretq_s8_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_p64(poly64x2_t a) { return vreinterpretq_s8_p64(a); } -// CHECK-LABEL: test_vreinterpretq_s16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) { return vreinterpretq_s16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_s16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = 
bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) { return vreinterpretq_s16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_s16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) { return vreinterpretq_s16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) { return vreinterpretq_s16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) { return vreinterpretq_s16_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) { return vreinterpretq_s16_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) { return vreinterpretq_s16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s16_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) { return vreinterpretq_s16_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) { return vreinterpretq_s16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s16_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_f64(float64x2_t a) { return vreinterpretq_s16_f64(a); } -// CHECK-LABEL: test_vreinterpretq_s16_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) { return vreinterpretq_s16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s16_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) { return vreinterpretq_s16_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] 
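For readers new to the FileCheck idiom used throughout this test: [[TMP0:%.*]] at its first occurrence captures whatever SSA name the regex %.* matches into the variable TMP0, and the following line then requires that same value, so each pair of checks verifies that the value produced by the bitcast is exactly the value returned. For example:

// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]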
int16x8_t test_vreinterpretq_s16_p64(poly64x2_t a) { return vreinterpretq_s16_p64(a); } -// CHECK-LABEL: test_vreinterpretq_s32_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) { return vreinterpretq_s32_s8(a); } -// CHECK-LABEL: test_vreinterpretq_s32_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) { return vreinterpretq_s32_s16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) { return vreinterpretq_s32_s64(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) { return vreinterpretq_s32_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) { return vreinterpretq_s32_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32(<4 x i32> %a) #0 { +// CHECK: ret <4 x i32> %a int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) { return vreinterpretq_s32_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) { return vreinterpretq_s32_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s32_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) { return vreinterpretq_s32_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) { return vreinterpretq_s32_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s32_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_f64(float64x2_t a) { return vreinterpretq_s32_f64(a); } -// CHECK-LABEL: test_vreinterpretq_s32_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t 
test_vreinterpretq_s32_p8(poly8x16_t a) { return vreinterpretq_s32_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s32_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) { return vreinterpretq_s32_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_p64(poly64x2_t a) { return vreinterpretq_s32_p64(a); } -// CHECK-LABEL: test_vreinterpretq_s64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) { return vreinterpretq_s64_s8(a); } -// CHECK-LABEL: test_vreinterpretq_s64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) { return vreinterpretq_s64_s16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) { return vreinterpretq_s64_s32(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) { return vreinterpretq_s64_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) { return vreinterpretq_s64_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) { return vreinterpretq_s64_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) { return vreinterpretq_s64_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) { return vreinterpretq_s64_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t 
test_vreinterpretq_s64_f32(float32x4_t a) { return vreinterpretq_s64_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s64_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_f64(float64x2_t a) { return vreinterpretq_s64_f64(a); } -// CHECK-LABEL: test_vreinterpretq_s64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) { return vreinterpretq_s64_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) { return vreinterpretq_s64_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a int64x2_t test_vreinterpretq_s64_p64(poly64x2_t a) { return vreinterpretq_s64_p64(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) { return vreinterpretq_u8_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) { return vreinterpretq_u8_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) { return vreinterpretq_u8_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) { return vreinterpretq_u8_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u8_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) { return vreinterpretq_u8_u16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) { return vreinterpretq_u8_u32(a); } -// CHECK-LABEL: test_vreinterpretq_u8_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) { return vreinterpretq_u8_u64(a); } -// CHECK-LABEL: 
test_vreinterpretq_u8_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) { return vreinterpretq_u8_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) { return vreinterpretq_u8_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u8_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_f64(float64x2_t a) { return vreinterpretq_u8_f64(a); } -// CHECK-LABEL: test_vreinterpretq_u8_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) { return vreinterpretq_u8_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u8_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) { return vreinterpretq_u8_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_p64(poly64x2_t a) { return vreinterpretq_u8_p64(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) { return vreinterpretq_u16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) { return vreinterpretq_u16_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) { return vreinterpretq_u16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) { return vreinterpretq_u16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) { return vreinterpretq_u16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_u16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> 
@test_vreinterpretq_u16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) { return vreinterpretq_u16_u32(a); } -// CHECK-LABEL: test_vreinterpretq_u16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) { return vreinterpretq_u16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_u16_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) { return vreinterpretq_u16_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) { return vreinterpretq_u16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u16_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_f64(float64x2_t a) { return vreinterpretq_u16_f64(a); } -// CHECK-LABEL: test_vreinterpretq_u16_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) { return vreinterpretq_u16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u16_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) { return vreinterpretq_u16_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_p64(poly64x2_t a) { return vreinterpretq_u16_p64(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) { return vreinterpretq_u32_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) { return vreinterpretq_u32_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32(<4 x i32> %a) #0 { +// CHECK: ret <4 x i32> %a uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) { return vreinterpretq_u32_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64(<2 x i64> %a) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) { return vreinterpretq_u32_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u32_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) { return vreinterpretq_u32_u8(a); } -// CHECK-LABEL: test_vreinterpretq_u32_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) { return vreinterpretq_u32_u16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) { return vreinterpretq_u32_u64(a); } -// CHECK-LABEL: test_vreinterpretq_u32_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) { return vreinterpretq_u32_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) { return vreinterpretq_u32_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u32_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_f64(float64x2_t a) { return vreinterpretq_u32_f64(a); } -// CHECK-LABEL: test_vreinterpretq_u32_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) { return vreinterpretq_u32_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u32_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) { return vreinterpretq_u32_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_p64(poly64x2_t a) { return vreinterpretq_u32_p64(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) { return vreinterpretq_u64_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 
x i64> @test_vreinterpretq_u64_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) { return vreinterpretq_u64_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) { return vreinterpretq_u64_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) { return vreinterpretq_u64_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) { return vreinterpretq_u64_u8(a); } -// CHECK-LABEL: test_vreinterpretq_u64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) { return vreinterpretq_u64_u16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) { return vreinterpretq_u64_u32(a); } -// CHECK-LABEL: test_vreinterpretq_u64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) { return vreinterpretq_u64_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) { return vreinterpretq_u64_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u64_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_f64(float64x2_t a) { return vreinterpretq_u64_f64(a); } -// CHECK-LABEL: test_vreinterpretq_u64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) { return vreinterpretq_u64_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) { return vreinterpretq_u64_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define 
<2 x i64> @test_vreinterpretq_u64_p64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a uint64x2_t test_vreinterpretq_u64_p64(poly64x2_t a) { return vreinterpretq_u64_p64(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) { return vreinterpretq_f16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) { return vreinterpretq_f16_s16(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) { return vreinterpretq_f16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) { return vreinterpretq_f16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_f16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) { return vreinterpretq_f16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_f16_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) { return vreinterpretq_f16_u16(a); } -// CHECK-LABEL: test_vreinterpretq_f16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) { return vreinterpretq_f16_u32(a); } -// CHECK-LABEL: test_vreinterpretq_f16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) { return vreinterpretq_f16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_f16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) { return vreinterpretq_f16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_f16_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_f64(float64x2_t a) { return vreinterpretq_f16_f64(a); } -// CHECK-LABEL: test_vreinterpretq_f16_p8: -// 
CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) { return vreinterpretq_f16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_f16_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) { return vreinterpretq_f16_p16(a); } -// CHECK-LABEL: test_vreinterpretq_f16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_p64(poly64x2_t a) { return vreinterpretq_f16_p64(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) { return vreinterpretq_f32_s8(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) { return vreinterpretq_f32_s16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) { return vreinterpretq_f32_s32(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) { return vreinterpretq_f32_s64(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) { return vreinterpretq_f32_u8(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) { return vreinterpretq_f32_u16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) { return vreinterpretq_f32_u32(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t 
test_vreinterpretq_f32_u64(uint64x2_t a) { return vreinterpretq_f32_u64(a); } -// CHECK-LABEL: test_vreinterpretq_f32_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) { return vreinterpretq_f32_f16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_f64(float64x2_t a) { return vreinterpretq_f32_f64(a); } -// CHECK-LABEL: test_vreinterpretq_f32_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) { return vreinterpretq_f32_p8(a); } -// CHECK-LABEL: test_vreinterpretq_f32_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) { return vreinterpretq_f32_p16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_p64(poly64x2_t a) { return vreinterpretq_f32_p64(a); } -// CHECK-LABEL: test_vreinterpretq_f64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_s8(int8x16_t a) { return vreinterpretq_f64_s8(a); } -// CHECK-LABEL: test_vreinterpretq_f64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_s16(int16x8_t a) { return vreinterpretq_f64_s16(a); } -// CHECK-LABEL: test_vreinterpretq_f64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_s32(int32x4_t a) { return vreinterpretq_f64_s32(a); } -// CHECK-LABEL: test_vreinterpretq_f64_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_s64(int64x2_t a) { return vreinterpretq_f64_s64(a); } -// CHECK-LABEL: test_vreinterpretq_f64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_u8(uint8x16_t a) { return vreinterpretq_f64_u8(a); } -// CHECK-LABEL: test_vreinterpretq_f64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> 
@test_vreinterpretq_f64_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_u16(uint16x8_t a) { return vreinterpretq_f64_u16(a); } -// CHECK-LABEL: test_vreinterpretq_f64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_u32(uint32x4_t a) { return vreinterpretq_f64_u32(a); } -// CHECK-LABEL: test_vreinterpretq_f64_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_u64(uint64x2_t a) { return vreinterpretq_f64_u64(a); } -// CHECK-LABEL: test_vreinterpretq_f64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_f16(float16x8_t a) { return vreinterpretq_f64_f16(a); } -// CHECK-LABEL: test_vreinterpretq_f64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_f32(float32x4_t a) { return vreinterpretq_f64_f32(a); } -// CHECK-LABEL: test_vreinterpretq_f64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_p8(poly8x16_t a) { return vreinterpretq_f64_p8(a); } -// CHECK-LABEL: test_vreinterpretq_f64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_p16(poly16x8_t a) { return vreinterpretq_f64_p16(a); } -// CHECK-LABEL: test_vreinterpretq_f64_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_p64(poly64x2_t a) { return vreinterpretq_f64_p64(a); } -// CHECK-LABEL: test_vreinterpretq_p8_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) { return vreinterpretq_p8_s8(a); } -// CHECK-LABEL: test_vreinterpretq_p8_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) { return vreinterpretq_p8_s16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) { return vreinterpretq_p8_s32(a); } -// CHECK-LABEL: 
test_vreinterpretq_p8_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) { return vreinterpretq_p8_s64(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) { return vreinterpretq_p8_u8(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) { return vreinterpretq_p8_u16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) { return vreinterpretq_p8_u32(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) { return vreinterpretq_p8_u64(a); } -// CHECK-LABEL: test_vreinterpretq_p8_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) { return vreinterpretq_p8_f16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) { return vreinterpretq_p8_f32(a); } -// CHECK-LABEL: test_vreinterpretq_p8_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_f64(float64x2_t a) { return vreinterpretq_p8_f64(a); } -// CHECK-LABEL: test_vreinterpretq_p8_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) { return vreinterpretq_p8_p16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_p64(poly64x2_t a) { return vreinterpretq_p8_p64(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) { return vreinterpretq_p16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s16: -// 
CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) { return vreinterpretq_p16_s16(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) { return vreinterpretq_p16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) { return vreinterpretq_p16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) { return vreinterpretq_p16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) { return vreinterpretq_p16_u16(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) { return vreinterpretq_p16_u32(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) { return vreinterpretq_p16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_p16_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) { return vreinterpretq_p16_f16(a); } -// CHECK-LABEL: test_vreinterpretq_p16_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) { return vreinterpretq_p16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_p16_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_f64(float64x2_t a) { return vreinterpretq_p16_f64(a); } -// CHECK-LABEL: test_vreinterpretq_p16_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) { return vreinterpretq_p16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_p16_p64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <8 x i16> 
@test_vreinterpretq_p16_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_p64(poly64x2_t a) { return vreinterpretq_p16_p64(a); } -// CHECK-LABEL: test_vreinterpretq_p64_s8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_s8(int8x16_t a) { return vreinterpretq_p64_s8(a); } -// CHECK-LABEL: test_vreinterpretq_p64_s16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_s16(int16x8_t a) { return vreinterpretq_p64_s16(a); } -// CHECK-LABEL: test_vreinterpretq_p64_s32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_s32(int32x4_t a) { return vreinterpretq_p64_s32(a); } -// CHECK-LABEL: test_vreinterpretq_p64_s64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a poly64x2_t test_vreinterpretq_p64_s64(int64x2_t a) { return vreinterpretq_p64_s64(a); } -// CHECK-LABEL: test_vreinterpretq_p64_u8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_u8(uint8x16_t a) { return vreinterpretq_p64_u8(a); } -// CHECK-LABEL: test_vreinterpretq_p64_u16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_u16(uint16x8_t a) { return vreinterpretq_p64_u16(a); } -// CHECK-LABEL: test_vreinterpretq_p64_u32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_u32(uint32x4_t a) { return vreinterpretq_p64_u32(a); } -// CHECK-LABEL: test_vreinterpretq_p64_u64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a poly64x2_t test_vreinterpretq_p64_u64(uint64x2_t a) { return vreinterpretq_p64_u64(a); } -// CHECK-LABEL: test_vreinterpretq_p64_f16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_f16(float16x8_t a) { return vreinterpretq_p64_f16(a); } -// CHECK-LABEL: test_vreinterpretq_p64_f32: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_f32(float32x4_t a) { return vreinterpretq_p64_f32(a); } -// CHECK-LABEL: test_vreinterpretq_p64_f64: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_f64(<2 x double> %a) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_f64(float64x2_t a) { return vreinterpretq_p64_f64(a); } -// CHECK-LABEL: test_vreinterpretq_p64_p8: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_p8(poly8x16_t a) { return vreinterpretq_p64_p8(a); } -// CHECK-LABEL: test_vreinterpretq_p64_p16: -// CHECK-NEXT: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_p16(poly16x8_t a) { return vreinterpretq_p64_p16(a); } +// CHECK-LABEL: define float @test_vabds_f32(float %a, float %b) #0 { +// CHECK: [[VABDS_F32_I:%.*]] = call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) #4 +// CHECK: ret float [[VABDS_F32_I]] float32_t test_vabds_f32(float32_t a, float32_t b) { -// CHECK-LABEL: test_vabds_f32 -// CHECK: fabd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} return vabds_f32(a, b); } +// CHECK-LABEL: define double @test_vabdd_f64(double %a, double %b) #0 { +// CHECK: [[VABDD_F64_I:%.*]] = call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) #4 +// CHECK: ret double [[VABDD_F64_I]] float64_t test_vabdd_f64(float64_t a, float64_t b) { -// CHECK-LABEL: test_vabdd_f64 -// CHECK: fabd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} return vabdd_f64(a, b); } +// CHECK-LABEL: define <1 x i64> @test_vuqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VUQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> [[VUQADD_I]], <1 x i64> [[VUQADD1_I]]) #4 +// CHECK: ret <1 x i64> [[VUQADD2_I]] int64x1_t test_vuqadd_s64(int64x1_t a, uint64x1_t b) { - // CHECK-LABEL: test_vuqadd_s64 return vuqadd_s64(a, b); - // CHECK: suqadd d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vsqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> [[VSQADD_I]], <1 x i64> [[VSQADD1_I]]) #4 +// CHECK: ret <1 x i64> [[VSQADD2_I]] uint64x1_t test_vsqadd_u64(uint64x1_t a, int64x1_t b) { - // CHECK-LABEL: test_vsqadd_u64 return vsqadd_u64(a, b); - // CHECK: usqadd d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <8 x i8> @test_vsqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VSQADD_I]] uint8x8_t test_vsqadd_u8(uint8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vsqadd_u8 return vsqadd_u8(a, b); - // CHECK: usqadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vsqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VSQADD_I]] uint8x16_t 
test_vsqaddq_u8(uint8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vsqaddq_u8 return vsqaddq_u8(a, b); - // CHECK: usqadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vsqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[VSQADD_I]], <4 x i16> [[VSQADD1_I]]) #4 +// CHECK: ret <4 x i16> [[VSQADD2_I]] uint16x4_t test_vsqadd_u16(uint16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vsqadd_u16 return vsqadd_u16(a, b); - // CHECK: usqadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vsqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> [[VSQADD_I]], <8 x i16> [[VSQADD1_I]]) #4 +// CHECK: ret <8 x i16> [[VSQADD2_I]] uint16x8_t test_vsqaddq_u16(uint16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vsqaddq_u16 return vsqaddq_u16(a, b); - // CHECK: usqadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vsqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> [[VSQADD_I]], <2 x i32> [[VSQADD1_I]]) #4 +// CHECK: ret <2 x i32> [[VSQADD2_I]] uint32x2_t test_vsqadd_u32(uint32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vsqadd_u32 return vsqadd_u32(a, b); - // CHECK: usqadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vsqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> [[VSQADD_I]], <4 x i32> [[VSQADD1_I]]) #4 +// CHECK: ret <4 x i32> [[VSQADD2_I]] uint32x4_t test_vsqaddq_u32(uint32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vsqaddq_u32 return vsqaddq_u32(a, b); - // CHECK: usqadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vsqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> [[VSQADD_I]], <2 x i64> [[VSQADD1_I]]) #4 +// CHECK: ret <2 x i64> [[VSQADD2_I]] uint64x2_t test_vsqaddq_u64(uint64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vsqaddq_u64 return vsqaddq_u64(a, b); - // CHECK: usqadd {{v[0-9]+}}.2d, 
{{v[0-9]+}}.2d } +// CHECK-LABEL: define <1 x i64> @test_vabs_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VABS1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> [[VABS_I]]) #4 +// CHECK: ret <1 x i64> [[VABS1_I]] int64x1_t test_vabs_s64(int64x1_t a) { - // CHECK-LABEL: test_vabs_s64 return vabs_s64(a); - // CHECK: abs d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vqabs_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQABS_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqabs.v1i64(<1 x i64> [[VQABS_V_I]]) #4 +// CHECK: [[VQABS_V2_I:%.*]] = bitcast <1 x i64> [[VQABS_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP1]] int64x1_t test_vqabs_s64(int64x1_t a) { - // CHECK-LABEL: test_vqabs_s64 return vqabs_s64(a); - // CHECK: sqabs d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vqneg_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQNEG_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqneg.v1i64(<1 x i64> [[VQNEG_V_I]]) #4 +// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <1 x i64> [[VQNEG_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP1]] int64x1_t test_vqneg_s64(int64x1_t a) { - // CHECK-LABEL: test_vqneg_s64 return vqneg_s64(a); - // CHECK: sqneg d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vneg_s64(<1 x i64> %a) #0 { +// CHECK: [[SUB_I:%.*]] = sub <1 x i64> zeroinitializer, %a +// CHECK: ret <1 x i64> [[SUB_I]] int64x1_t test_vneg_s64(int64x1_t a) { - // CHECK-LABEL: test_vneg_s64 return vneg_s64(a); - // CHECK: neg d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define float @test_vaddv_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VADDV_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: ret float [[VADDV_F32_I]] float32_t test_vaddv_f32(float32x2_t a) { - // CHECK-LABEL: test_vaddv_f32 return vaddv_f32(a); - // CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s } +// CHECK-LABEL: define float @test_vaddvq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VADDVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> [[TMP1]]) #4 +// CHECK: ret float [[VADDVQ_F32_I]] float32_t test_vaddvq_f32(float32x4_t a) { - // CHECK-LABEL: test_vaddvq_f32 return vaddvq_f32(a); - // CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s } +// CHECK-LABEL: define double @test_vaddvq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VADDVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> [[TMP1]]) #4 +// CHECK: ret double [[VADDVQ_F64_I]] float64_t test_vaddvq_f64(float64x2_t a) { - // CHECK-LABEL: test_vaddvq_f64 return vaddvq_f64(a); - // CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d 
} +// CHECK-LABEL: define float @test_vmaxv_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMAXV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: ret float [[VMAXV_F32_I]] float32_t test_vmaxv_f32(float32x2_t a) { - // CHECK-LABEL: test_vmaxv_f32 return vmaxv_f32(a); - // CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s } +// CHECK-LABEL: define double @test_vmaxvq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMAXVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[TMP1]]) #4 +// CHECK: ret double [[VMAXVQ_F64_I]] float64_t test_vmaxvq_f64(float64x2_t a) { - // CHECK-LABEL: test_vmaxvq_f64 return vmaxvq_f64(a); - // CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d } +// CHECK-LABEL: define float @test_vminv_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMINV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: ret float [[VMINV_F32_I]] float32_t test_vminv_f32(float32x2_t a) { - // CHECK-LABEL: test_vminv_f32 return vminv_f32(a); - // CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s } +// CHECK-LABEL: define double @test_vminvq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMINVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[TMP1]]) #4 +// CHECK: ret double [[VMINVQ_F64_I]] float64_t test_vminvq_f64(float64x2_t a) { - // CHECK-LABEL: test_vminvq_f64 return vminvq_f64(a); - // CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d } +// CHECK-LABEL: define double @test_vmaxnmvq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMAXNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[TMP1]]) #4 +// CHECK: ret double [[VMAXNMVQ_F64_I]] float64_t test_vmaxnmvq_f64(float64x2_t a) { - // CHECK-LABEL: test_vmaxnmvq_f64 return vmaxnmvq_f64(a); - // CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d } +// CHECK-LABEL: define float @test_vmaxnmv_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMAXNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: ret float [[VMAXNMV_F32_I]] float32_t test_vmaxnmv_f32(float32x2_t a) { - // CHECK-LABEL: test_vmaxnmv_f32 return vmaxnmv_f32(a); - // CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s } +// CHECK-LABEL: define double @test_vminnmvq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VMINNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[TMP1]]) #4 +// CHECK: ret double [[VMINNMVQ_F64_I]] float64_t test_vminnmvq_f64(float64x2_t a) { - // CHECK-LABEL: test_vminnmvq_f64 return vminnmvq_f64(a); - // CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d } +// CHECK-LABEL: define float @test_vminnmv_f32(<2 x float> %a) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMINNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[TMP1]]) #4 +// CHECK: ret float [[VMINNMV_F32_I]] float32_t test_vminnmv_f32(float32x2_t a) { - // CHECK-LABEL: test_vminnmv_f32 return vminnmv_f32(a); - // CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s } +// CHECK-LABEL: define <2 x i64> @test_vpaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[VPADDQ_V_I]], <2 x i64> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vpaddq_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vpaddq_s64 return vpaddq_s64(a, b); - // CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vpaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[VPADDQ_V_I]], <2 x i64> [[VPADDQ_V1_I]]) #4 +// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vpaddq_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vpaddq_u64 return vpaddq_u64(a, b); - // CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define i64 @test_vpaddd_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VPADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[TMP1]]) #4 +// CHECK: ret i64 [[VPADDD_U64_I]] uint64_t test_vpaddd_u64(uint64x2_t a) { - // CHECK-LABEL: test_vpaddd_u64 return vpaddd_u64(a); - // CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d } +// CHECK-LABEL: define i64 @test_vaddvq_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VADDVQ_S64_I:%.*]] = call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> [[TMP1]]) #4 +// CHECK: ret i64 [[VADDVQ_S64_I]] int64_t test_vaddvq_s64(int64x2_t a) { - // CHECK-LABEL: test_vaddvq_s64 return vaddvq_s64(a); - // CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d } +// CHECK-LABEL: define i64 @test_vaddvq_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VADDVQ_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[TMP1]]) #4 +// CHECK: ret i64 [[VADDVQ_U64_I]] uint64_t test_vaddvq_u64(uint64x2_t a) { - // CHECK-LABEL: test_vaddvq_u64 return vaddvq_u64(a); - // CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <1 
x double> @test_vadd_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[ADD_I:%.*]] = fadd <1 x double> %a, %b +// CHECK: ret <1 x double> [[ADD_I]] float64x1_t test_vadd_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vadd_f64 return vadd_f64(a, b); - // CHECK: fadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vmul_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %a, %b +// CHECK: ret <1 x double> [[MUL_I]] float64x1_t test_vmul_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vmul_f64 return vmul_f64(a, b); - // CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vdiv_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[DIV_I:%.*]] = fdiv <1 x double> %a, %b +// CHECK: ret <1 x double> [[DIV_I]] float64x1_t test_vdiv_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vdiv_f64 return vdiv_f64(a, b); - // CHECK: fdiv d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vmla_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %b, %c +// CHECK: [[ADD_I:%.*]] = fadd <1 x double> %a, [[MUL_I]] +// CHECK: ret <1 x double> [[ADD_I]] float64x1_t test_vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) { - // CHECK-LABEL: test_vmla_f64 return vmla_f64(a, b, c); - // CHECK: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vmls_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %b, %c +// CHECK: [[SUB_I:%.*]] = fsub <1 x double> %a, [[MUL_I]] +// CHECK: ret <1 x double> [[SUB_I]] float64x1_t test_vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) { - // CHECK-LABEL: test_vmls_f64 return vmls_f64(a, b, c); - // CHECK: fmsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vfma_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK: [[TMP6:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[TMP4]], <1 x double> [[TMP5]], <1 x double> [[TMP3]]) #4 +// CHECK: ret <1 x double> [[TMP6]] float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) { - // CHECK-LABEL: test_vfma_f64 return vfma_f64(a, b, c); - // CHECK: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vfms_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[TMP4:%.*]] = fsub <1 x double> <double -0.000000e+00>, [[TMP3]] +// CHECK: [[FMLS_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK: [[FMLS1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[FMLS2_I:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLS_I]], <1 x double> [[TMP4]], <1 x double> [[FMLS1_I]]) #4 +// CHECK: ret <1 x double> [[FMLS2_I]] 
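+// NOTE: There is no separate fused multiply-subtract intrinsic at the IR
+// level; as the checks above show, vfms_f64 is lowered to an fsub from -0.0
+// (negating the second multiplicand) followed by @llvm.fma.v1f64, computing
+// a + (-b) * c. A rough C-level equivalent, given only as an illustrative
+// sketch (not part of the test; it reuses intrinsics already exercised in
+// this file):
+//   float64x1_t vfms_sketch(float64x1_t a, float64x1_t b, float64x1_t c) {
+//     return vfma_f64(a, vneg_f64(b), c); // fma(-b, c, a) == a - b*c
+//   }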
float64x1_t test_vfms_f64(float64x1_t a, float64x1_t b, float64x1_t c) { - // CHECK-LABEL: test_vfms_f64 return vfms_f64(a, b, c); - // CHECK: fmsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vsub_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[SUB_I:%.*]] = fsub <1 x double> %a, %b +// CHECK: ret <1 x double> [[SUB_I]] float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vsub_f64 return vsub_f64(a, b); - // CHECK: fsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vabd_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VABD2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> [[VABD_I]], <1 x double> [[VABD1_I]]) #4 +// CHECK: ret <1 x double> [[VABD2_I]] float64x1_t test_vabd_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vabd_f64 return vabd_f64(a, b); - // CHECK: fabd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vmax_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VMAX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> [[VMAX_I]], <1 x double> [[VMAX1_I]]) #4 +// CHECK: ret <1 x double> [[VMAX2_I]] float64x1_t test_vmax_f64(float64x1_t a, float64x1_t b) { -// CHECK-LABEL: test_vmax_f64 return vmax_f64(a, b); -// CHECK: fmax d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vmin_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VMIN2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> [[VMIN_I]], <1 x double> [[VMIN1_I]]) #4 +// CHECK: ret <1 x double> [[VMIN2_I]] float64x1_t test_vmin_f64(float64x1_t a, float64x1_t b) { -// CHECK-LABEL: test_vmin_f64 return vmin_f64(a, b); -// CHECK: fmin d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vmaxnm_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VMAXNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> [[VMAXNM_I]], <1 x double> [[VMAXNM1_I]]) #4 +// CHECK: ret <1 x double> [[VMAXNM2_I]] float64x1_t test_vmaxnm_f64(float64x1_t a, float64x1_t b) { -// CHECK-LABEL: test_vmaxnm_f64 return vmaxnm_f64(a, b); -// CHECK: fmaxnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vminnm_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x 
double> %b to <8 x i8> +// CHECK: [[VMINNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VMINNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> [[VMINNM_I]], <1 x double> [[VMINNM1_I]]) #4 +// CHECK: ret <1 x double> [[VMINNM2_I]] float64x1_t test_vminnm_f64(float64x1_t a, float64x1_t b) { -// CHECK-LABEL: test_vminnm_f64 return vminnm_f64(a, b); -// CHECK: fminnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vabs_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VABS1_I:%.*]] = call <1 x double> @llvm.fabs.v1f64(<1 x double> [[VABS_I]]) #4 +// CHECK: ret <1 x double> [[VABS1_I]] float64x1_t test_vabs_f64(float64x1_t a) { - // CHECK-LABEL: test_vabs_f64 return vabs_f64(a); - // CHECK: fabs d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vneg_f64(<1 x double> %a) #0 { +// CHECK: [[SUB_I:%.*]] = fsub <1 x double> <double -0.000000e+00>, %a +// CHECK: ret <1 x double> [[SUB_I]] float64x1_t test_vneg_f64(float64x1_t a) { - // CHECK-LABEL: test_vneg_f64 return vneg_f64(a); - // CHECK: fneg d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvt_s64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP2:%.*]] = fptosi <1 x double> [[TMP1]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vcvt_s64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvt_s64_f64 return vcvt_s64_f64(a); - // CHECK: fcvtzs {{[xd][0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvt_u64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP2:%.*]] = fptoui <1 x double> [[TMP1]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vcvt_u64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvt_u64_f64 return vcvt_u64_f64(a); - // CHECK: fcvtzu {{[xd][0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvtn_s64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> [[VCVTN_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTN1_I]] int64x1_t test_vcvtn_s64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvtn_s64_f64 return vcvtn_s64_f64(a); - // CHECK: fcvtns d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvtn_u64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> [[VCVTN_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTN1_I]] uint64x1_t test_vcvtn_u64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvtn_u64_f64 return vcvtn_u64_f64(a); - // CHECK: fcvtnu d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvtp_s64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x 
double> [[VCVTP_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTP1_I]] int64x1_t test_vcvtp_s64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvtp_s64_f64 return vcvtp_s64_f64(a); - // CHECK: fcvtps d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvtp_u64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> [[VCVTP_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTP1_I]] uint64x1_t test_vcvtp_u64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvtp_u64_f64 return vcvtp_u64_f64(a); - // CHECK: fcvtpu d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvtm_s64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> [[VCVTM_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTM1_I]] int64x1_t test_vcvtm_s64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvtm_s64_f64 return vcvtm_s64_f64(a); - // CHECK: fcvtms d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvtm_u64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> [[VCVTM_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTM1_I]] uint64x1_t test_vcvtm_u64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvtm_u64_f64 return vcvtm_u64_f64(a); - // CHECK: fcvtmu d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvta_s64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> [[VCVTA_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTA1_I]] int64x1_t test_vcvta_s64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvta_s64_f64 return vcvta_s64_f64(a); - // CHECK: fcvtas d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvta_u64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> [[VCVTA_I]]) #4 +// CHECK: ret <1 x i64> [[VCVTA1_I]] uint64x1_t test_vcvta_u64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvta_u64_f64 return vcvta_u64_f64(a); - // CHECK: fcvtau d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vcvt_f64_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VCVT_I:%.*]] = sitofp <1 x i64> [[TMP1]] to <1 x double> +// CHECK: ret <1 x double> [[VCVT_I]] float64x1_t test_vcvt_f64_s64(int64x1_t a) { - // CHECK-LABEL: test_vcvt_f64_s64 return vcvt_f64_s64(a); - // CHECK: scvtf d{{[0-9]+}}, {{[xd][0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vcvt_f64_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VCVT_I:%.*]] = uitofp <1 x i64> [[TMP1]] to <1 x double> +// CHECK: ret <1 x double> [[VCVT_I]] 
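+// NOTE: The vcvt_n_* tests below cover the fixed-point conversions. The
+// scale is passed to the @llvm.aarch64.neon.vcvtfp2fxs/vcvtfp2fxu and
+// vcvtfxs2fp/vcvtfxu2fp intrinsics as an i32 immediate (64 here, the maximum
+// for a 64-bit lane), so vcvt_n_s64_f64(x, n) truncates x * 2^n to a signed
+// integer. For example (illustrative only, not part of the test), a lane
+// holding 2.5 passed through vcvt_n_s64_f64 with n == 1 yields 5.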
float64x1_t test_vcvt_f64_u64(uint64x1_t a) { - // CHECK-LABEL: test_vcvt_f64_u64 return vcvt_f64_u64(a); - // CHECK: ucvtf d{{[0-9]+}}, {{[xd][0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64) +// CHECK: ret <1 x i64> [[VCVT_N1]] int64x1_t test_vcvt_n_s64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvt_n_s64_f64 return vcvt_n_s64_f64(a, 64); - // CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}, #64 } +// CHECK-LABEL: define <1 x i64> @test_vcvt_n_u64_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64) +// CHECK: ret <1 x i64> [[VCVT_N1]] uint64x1_t test_vcvt_n_u64_f64(float64x1_t a) { - // CHECK-LABEL: test_vcvt_n_u64_f64 return vcvt_n_u64_f64(a, 64); - // CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}, #64 } +// CHECK-LABEL: define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64) +// CHECK: ret <1 x double> [[VCVT_N1]] float64x1_t test_vcvt_n_f64_s64(int64x1_t a) { - // CHECK-LABEL: test_vcvt_n_f64_s64 return vcvt_n_f64_s64(a, 64); - // CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}, #64 } +// CHECK-LABEL: define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64) +// CHECK: ret <1 x double> [[VCVT_N1]] float64x1_t test_vcvt_n_f64_u64(uint64x1_t a) { - // CHECK-LABEL: test_vcvt_n_f64_u64 return vcvt_n_f64_u64(a, 64); - // CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}, #64 } +// CHECK-LABEL: define <1 x double> @test_vrndn_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRNDN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRNDN1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double> [[VRNDN_I]]) #4 +// CHECK: ret <1 x double> [[VRNDN1_I]] float64x1_t test_vrndn_f64(float64x1_t a) { - // CHECK-LABEL: test_vrndn_f64 return vrndn_f64(a); - // CHECK: frintn d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrnda_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRNDA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> [[VRNDA_I]]) #4 +// CHECK: ret <1 x double> [[VRNDA1_I]] float64x1_t test_vrnda_f64(float64x1_t a) { - // CHECK-LABEL: test_vrnda_f64 return vrnda_f64(a); - // CHECK: frinta d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrndp_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRNDP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> 
[[VRNDP_I]]) #4 +// CHECK: ret <1 x double> [[VRNDP1_I]] float64x1_t test_vrndp_f64(float64x1_t a) { - // CHECK-LABEL: test_vrndp_f64 return vrndp_f64(a); - // CHECK: frintp d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrndm_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRNDM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> [[VRNDM_I]]) #4 +// CHECK: ret <1 x double> [[VRNDM1_I]] float64x1_t test_vrndm_f64(float64x1_t a) { - // CHECK-LABEL: test_vrndm_f64 return vrndm_f64(a); - // CHECK: frintm d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrndx_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRNDX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> [[VRNDX_I]]) #4 +// CHECK: ret <1 x double> [[VRNDX1_I]] float64x1_t test_vrndx_f64(float64x1_t a) { - // CHECK-LABEL: test_vrndx_f64 return vrndx_f64(a); - // CHECK: frintx d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrnd_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRNDZ_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> [[VRNDZ_I]]) #4 +// CHECK: ret <1 x double> [[VRNDZ1_I]] float64x1_t test_vrnd_f64(float64x1_t a) { - // CHECK-LABEL: test_vrnd_f64 return vrnd_f64(a); - // CHECK: frintz d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrndi_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRNDI_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRNDI1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> [[VRNDI_I]]) #4 +// CHECK: ret <1 x double> [[VRNDI1_I]] float64x1_t test_vrndi_f64(float64x1_t a) { - // CHECK-LABEL: test_vrndi_f64 return vrndi_f64(a); - // CHECK: frinti d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrsqrte_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRSQRTE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrte.v1f64(<1 x double> [[VRSQRTE_V_I]]) #4 +// CHECK: ret <1 x double> [[VRSQRTE_V1_I]] float64x1_t test_vrsqrte_f64(float64x1_t a) { - // CHECK-LABEL: test_vrsqrte_f64 return vrsqrte_f64(a); - // CHECK: frsqrte d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrecpe_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRECPE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecpe.v1f64(<1 x double> [[VRECPE_V_I]]) #4 +// CHECK: ret <1 x double> [[VRECPE_V1_I]] float64x1_t test_vrecpe_f64(float64x1_t a) { - // CHECK-LABEL: test_vrecpe_f64 return vrecpe_f64(a); - // CHECK: frecpe d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vsqrt_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> [[TMP1]]) #4 +// CHECK: ret <1 x double> [[VSQRT_I]] float64x1_t 
test_vsqrt_f64(float64x1_t a) { - // CHECK-LABEL: test_vsqrt_f64 return vsqrt_f64(a); - // CHECK: fsqrt d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrecps_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VRECPS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecps.v1f64(<1 x double> [[VRECPS_V_I]], <1 x double> [[VRECPS_V1_I]]) #4 +// CHECK: [[VRECPS_V3_I:%.*]] = bitcast <1 x double> [[VRECPS_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <1 x double> +// CHECK: ret <1 x double> [[TMP2]] float64x1_t test_vrecps_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vrecps_f64 return vrecps_f64(a, b); - // CHECK: frecps d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define <1 x double> @test_vrsqrts_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[VRSQRTS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrts.v1f64(<1 x double> [[VRSQRTS_V_I]], <1 x double> [[VRSQRTS_V1_I]]) #4 +// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <1 x double> [[VRSQRTS_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <1 x double> +// CHECK: ret <1 x double> [[TMP2]] float64x1_t test_vrsqrts_f64(float64x1_t a, float64x1_t b) { - // CHECK-LABEL: test_vrsqrts_f64 return vrsqrts_f64(a, b); - // CHECK: frsqrts d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} } +// CHECK-LABEL: define i32 @test_vminv_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMINV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i32 [[VMINV_S32_I]] int32_t test_vminv_s32(int32x2_t a) { - // CHECK-LABEL: test_vminv_s32 return vminv_s32(a); - // CHECK: sminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define i32 @test_vminv_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMINV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i32 [[VMINV_U32_I]] uint32_t test_vminv_u32(uint32x2_t a) { - // CHECK-LABEL: test_vminv_u32 return vminv_u32(a); - // CHECK: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define i32 @test_vmaxv_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMAXV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i32 [[VMAXV_S32_I]] int32_t test_vmaxv_s32(int32x2_t a) { - // CHECK-LABEL: test_vmaxv_s32 return vmaxv_s32(a); - // CHECK: smaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define i32 @test_vmaxv_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: 
[[VMAXV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i32 [[VMAXV_U32_I]] uint32_t test_vmaxv_u32(uint32x2_t a) { - // CHECK-LABEL: test_vmaxv_u32 return vmaxv_u32(a); - // CHECK: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define i32 @test_vaddv_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VADDV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i32 [[VADDV_S32_I]] int32_t test_vaddv_s32(int32x2_t a) { - // CHECK-LABEL: test_vaddv_s32 return vaddv_s32(a); - // CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define i32 @test_vaddv_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VADDV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i32 [[VADDV_U32_I]] uint32_t test_vaddv_u32(uint32x2_t a) { - // CHECK-LABEL: test_vaddv_u32 return vaddv_u32(a); - // CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s } +// CHECK-LABEL: define i64 @test_vaddlv_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VADDLV_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i64 [[VADDLV_S32_I]] int64_t test_vaddlv_s32(int32x2_t a) { - // CHECK-LABEL: test_vaddlv_s32 return vaddlv_s32(a); - // CHECK: saddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s } +// CHECK-LABEL: define i64 @test_vaddlv_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VADDLV_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> [[TMP1]]) #4 +// CHECK: ret i64 [[VADDLV_U32_I]] uint64_t test_vaddlv_u32(uint32x2_t a) { - // CHECK-LABEL: test_vaddlv_u32 return vaddlv_u32(a); - // CHECK: uaddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s } Index: test/CodeGen/aarch64-neon-ldst-one.c =================================================================== --- test/CodeGen/aarch64-neon-ldst-one.c +++ test/CodeGen/aarch64-neon-ldst-one.c @@ -1,2049 +1,7977 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -fallow-half-arguments-and-returns -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg | FileCheck %s #include <arm_neon.h> +// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer +// CHECK: ret <16 x i8> [[LANE]] uint8x16_t test_vld1q_dup_u8(uint8_t *a) { - // CHECK-LABEL: test_vld1q_dup_u8 return vld1q_dup_u8(a); - // CHECK: ld1r {{{ *v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> 
+// CHECK: ret <8 x i16> [[LANE]]
 uint16x8_t test_vld1q_dup_u16(uint16_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_u16
   return vld1q_dup_u16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: ret <4 x i32> [[LANE]]
 uint32x4_t test_vld1q_dup_u32(uint32_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_u32
   return vld1q_dup_u32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x i64> [[LANE]]
 uint64x2_t test_vld1q_dup_u64(uint64_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_u64
   return vld1q_dup_u64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK: ret <16 x i8> [[LANE]]
 int8x16_t test_vld1q_dup_s8(int8_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_s8
   return vld1q_dup_s8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK: ret <8 x i16> [[LANE]]
 int16x8_t test_vld1q_dup_s16(int16_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_s16
   return vld1q_dup_s16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: ret <4 x i32> [[LANE]]
 int32x4_t test_vld1q_dup_s32(int32_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_s32
   return vld1q_dup_s32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x i64> [[LANE]]
 int64x2_t test_vld1q_dup_s64(int64_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_s64
   return vld1q_dup_s64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(half* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
+// CHECK: ret <8 x half> [[TMP4]]
 float16x8_t test_vld1q_dup_f16(float16_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_f16
   return vld1q_dup_f16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32(float* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK: [[TMP2:%.*]] = load float, float* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: ret <4 x float> [[LANE]]
 float32x4_t test_vld1q_dup_f32(float32_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_f32
   return vld1q_dup_f32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <2 x double> @test_vld1q_dup_f64(double* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK: [[TMP2:%.*]] = load double, double* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x double> [[LANE]]
 float64x2_t test_vld1q_dup_f64(float64_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_f64
   return vld1q_dup_f64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK: ret <16 x i8> [[LANE]]
 poly8x16_t test_vld1q_dup_p8(poly8_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_p8
   return vld1q_dup_p8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK: ret <8 x i16> [[LANE]]
 poly16x8_t test_vld1q_dup_p16(poly16_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_p16
   return vld1q_dup_p16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_p64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x i64> [[LANE]]
 poly64x2_t test_vld1q_dup_p64(poly64_t *a) {
-  // CHECK-LABEL: test_vld1q_dup_p64
   return vld1q_dup_p64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK: ret <8 x i8> [[LANE]]
 uint8x8_t test_vld1_dup_u8(uint8_t *a) {
-  // CHECK-LABEL: test_vld1_dup_u8
   return vld1_dup_u8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: ret <4 x i16> [[LANE]]
 uint16x4_t test_vld1_dup_u16(uint16_t *a) {
-  // CHECK-LABEL: test_vld1_dup_u16
   return vld1_dup_u16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x i32> [[LANE]]
 uint32x2_t test_vld1_dup_u32(uint32_t *a) {
-  // CHECK-LABEL: test_vld1_dup_u32
   return vld1_dup_u32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK: ret <1 x i64> [[LANE]]
 uint64x1_t test_vld1_dup_u64(uint64_t *a) {
-  // CHECK-LABEL: test_vld1_dup_u64
   return vld1_dup_u64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK: ret <8 x i8> [[LANE]]
 int8x8_t test_vld1_dup_s8(int8_t *a) {
-  // CHECK-LABEL: test_vld1_dup_s8
   return vld1_dup_s8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: ret <4 x i16> [[LANE]]
 int16x4_t test_vld1_dup_s16(int16_t *a) {
-  // CHECK-LABEL: test_vld1_dup_s16
   return vld1_dup_s16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32(i32* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x i32> [[LANE]]
 int32x2_t test_vld1_dup_s32(int32_t *a) {
-  // CHECK-LABEL: test_vld1_dup_s32
   return vld1_dup_s32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK: ret <1 x i64> [[LANE]]
 int64x1_t test_vld1_dup_s64(int64_t *a) {
-  // CHECK-LABEL: test_vld1_dup_s64
   return vld1_dup_s64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(half* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
+// CHECK: ret <4 x half> [[TMP4]]
 float16x4_t test_vld1_dup_f16(float16_t *a) {
-  // CHECK-LABEL: test_vld1_dup_f16
   return vld1_dup_f16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32(float* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK: [[TMP2:%.*]] = load float, float* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x float> [[LANE]]
 float32x2_t test_vld1_dup_f32(float32_t *a) {
-  // CHECK-LABEL: test_vld1_dup_f32
   return vld1_dup_f32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <1 x double> @test_vld1_dup_f64(double* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK: [[TMP2:%.*]] = load double, double* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <1 x double> undef, double [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK: ret <1 x double> [[LANE]]
 float64x1_t test_vld1_dup_f64(float64_t *a) {
-  // CHECK-LABEL: test_vld1_dup_f64
   return vld1_dup_f64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK: ret <8 x i8> [[LANE]]
 poly8x8_t test_vld1_dup_p8(poly8_t *a) {
-  // CHECK-LABEL: test_vld1_dup_p8
   return vld1_dup_p8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: ret <4 x i16> [[LANE]]
 poly16x4_t test_vld1_dup_p16(poly16_t *a) {
-  // CHECK-LABEL: test_vld1_dup_p16
   return vld1_dup_p16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_p64(i64* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK: ret <1 x i64> [[LANE]]
 poly64x1_t test_vld1_dup_p64(poly64_t *a) {
-  // CHECK-LABEL: test_vld1_dup_p64
   return vld1_dup_p64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_dup_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x2_t [[TMP4]]
 uint8x16x2_t test_vld2q_dup_u8(uint8_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_u8
   return vld2q_dup_u8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_dup_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x2_t [[TMP6]]
 uint16x8x2_t test_vld2q_dup_u16(uint16_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_u16
   return vld2q_dup_u16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_dup_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x2_t [[TMP6]]
 uint32x4x2_t test_vld2q_dup_u32(uint32_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_u32
   return vld2q_dup_u32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_dup_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
 uint64x2x2_t test_vld2q_dup_u64(uint64_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_u64
   return vld2q_dup_u64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int8x16x2_t [[TMP4]]
 int8x16x2_t test_vld2q_dup_s8(int8_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_s8
   return vld2q_dup_s8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x2_t [[TMP6]]
 int16x8x2_t test_vld2q_dup_s16(int16_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_s16
   return vld2q_dup_s16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x2_t [[TMP6]]
 int32x4x2_t test_vld2q_dup_s32(int32_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_s32
   return vld2q_dup_s32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int64x2x2_t [[TMP6]]
 int64x2x2_t test_vld2q_dup_s64(int64_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_s64
   return vld2q_dup_s64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_dup_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float16x8x2_t [[TMP6]]
 float16x8x2_t test_vld2q_dup_f16(float16_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_f16
   return vld2q_dup_f16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
+// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float32x4x2_t [[TMP6]]
 float32x4x2_t test_vld2q_dup_f32(float32_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_f32
   return vld2q_dup_f32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
+// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float64x2x2_t [[TMP6]]
 float64x2x2_t test_vld2q_dup_f64(float64_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_f64
   return vld2q_dup_f64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_dup_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly8x16x2_t [[TMP4]]
 poly8x16x2_t test_vld2q_dup_p8(poly8_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_p8
   return vld2q_dup_p8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_dup_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly16x8x2_t [[TMP6]]
 poly16x8x2_t test_vld2q_dup_p16(poly16_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_p16
   return vld2q_dup_p16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_dup_p64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
 poly64x2x2_t test_vld2q_dup_p64(poly64_t *a) {
-  // CHECK-LABEL: test_vld2q_dup_p64
   return vld2q_dup_p64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_dup_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x2_t [[TMP4]]
 uint8x8x2_t test_vld2_dup_u8(uint8_t *a) {
-  // CHECK-LABEL: test_vld2_dup_u8
   return vld2_dup_u8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_dup_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x2_t [[TMP6]]
 uint16x4x2_t test_vld2_dup_u16(uint16_t *a) {
-  // CHECK-LABEL: test_vld2_dup_u16
   return vld2_dup_u16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_dup_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x2_t [[TMP6]]
 uint32x2x2_t test_vld2_dup_u32(uint32_t *a) {
-  // CHECK-LABEL: test_vld2_dup_u32
   return vld2_dup_u32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_dup_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint64x1x2_t [[TMP6]]
 uint64x1x2_t test_vld2_dup_u64(uint64_t *a) {
-  // CHECK-LABEL: test_vld2_dup_u64
   return vld2_dup_u64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x2_t [[TMP4]]
 int8x8x2_t test_vld2_dup_s8(int8_t *a) {
-  // CHECK-LABEL: test_vld2_dup_s8
   return vld2_dup_s8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x2_t [[TMP6]]
 int16x4x2_t test_vld2_dup_s16(int16_t *a) {
-  // CHECK-LABEL: test_vld2_dup_s16
   return vld2_dup_s16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x2_t [[TMP6]]
 int32x2x2_t test_vld2_dup_s32(int32_t *a) {
-  // CHECK-LABEL: test_vld2_dup_s32
   return vld2_dup_s32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int64x1x2_t [[TMP6]]
 int64x1x2_t test_vld2_dup_s64(int64_t *a) {
-  // CHECK-LABEL: test_vld2_dup_s64
   return vld2_dup_s64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_dup_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float16x4x2_t [[TMP6]]
 float16x4x2_t test_vld2_dup_f16(float16_t *a) {
-  // CHECK-LABEL: test_vld2_dup_f16
   return vld2_dup_f16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x2_t [[TMP6]]
 float32x2x2_t test_vld2_dup_f32(float32_t *a) {
-  // CHECK-LABEL: test_vld2_dup_f32
   return vld2_dup_f32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
+// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float64x1x2_t [[TMP6]]
 float64x1x2_t test_vld2_dup_f64(float64_t *a) {
-  // CHECK-LABEL: test_vld2_dup_f64
   return vld2_dup_f64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_dup_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x2_t [[TMP4]]
 poly8x8x2_t test_vld2_dup_p8(poly8_t *a) {
-  // CHECK-LABEL: test_vld2_dup_p8
   return vld2_dup_p8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_dup_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
 poly16x4x2_t test_vld2_dup_p16(poly16_t *a) {
-  // CHECK-LABEL: test_vld2_dup_p16
   return vld2_dup_p16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_dup_p64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
 poly64x1x2_t test_vld2_dup_p64(poly64_t *a) {
-  // CHECK-LABEL: test_vld2_dup_p64
   return vld2_dup_p64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_dup_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x3_t [[TMP4]]
 uint8x16x3_t test_vld3q_dup_u8(uint8_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_u8
   return vld3q_dup_u8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_dup_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x3_t [[TMP6]]
 uint16x8x3_t test_vld3q_dup_u16(uint16_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_u16
   return vld3q_dup_u16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_dup_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x3_t [[TMP6]]
 uint32x4x3_t test_vld3q_dup_u32(uint32_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_u32
   return vld3q_dup_u32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_dup_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint64x2x3_t [[TMP6]]
 uint64x2x3_t test_vld3q_dup_u64(uint64_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_u64
   return vld3q_dup_u64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int8x16x3_t [[TMP4]]
 int8x16x3_t test_vld3q_dup_s8(int8_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_s8
   return vld3q_dup_s8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x3_t [[TMP6]]
 int16x8x3_t test_vld3q_dup_s16(int16_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_s16
   return vld3q_dup_s16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x3_t [[TMP6]]
 int32x4x3_t test_vld3q_dup_s32(int32_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_s32
   return vld3q_dup_s32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int64x2x3_t [[TMP6]]
 int64x2x3_t test_vld3q_dup_s64(int64_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_s64
   return vld3q_dup_s64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
-  // [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_dup_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float16x8x3_t [[TMP6]]
 float16x8x3_t test_vld3q_dup_f16(float16_t *a) {
-  // CHECK-LABEL: test_vld3q_dup_f16
   return vld3q_dup_f16(a);
- // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x3_t [[TMP6]] float32x4x3_t test_vld3q_dup_f32(float32_t *a) { - // CHECK-LABEL: test_vld3q_dup_f32 return vld3q_dup_f32(a); - // CHECK: ld3r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x3_t [[TMP6]] float64x2x3_t test_vld3q_dup_f64(float64_t *a) { - // CHECK-LABEL: test_vld3q_dup_f64 return vld3q_dup_f64(a); - // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_dup_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], 
i8* [[TMP3]], i64 48, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x3_t [[TMP4]] poly8x16x3_t test_vld3q_dup_p8(poly8_t *a) { - // CHECK-LABEL: test_vld3q_dup_p8 return vld3q_dup_p8(a); - // CHECK: ld3r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_dup_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x3_t [[TMP6]] poly16x8x3_t test_vld3q_dup_p16(poly16_t *a) { - // CHECK-LABEL: test_vld3q_dup_p16 return vld3q_dup_p16(a); - // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_dup_p64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x3_t [[TMP6]] poly64x2x3_t test_vld3q_dup_p64(poly64_t *a) { - // CHECK-LABEL: test_vld3q_dup_p64 return vld3q_dup_p64(a); - // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_dup_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast 
%struct.uint8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x3_t [[TMP4]] uint8x8x3_t test_vld3_dup_u8(uint8_t *a) { - // CHECK-LABEL: test_vld3_dup_u8 return vld3_dup_u8(a); - // CHECK: ld3r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_dup_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x3_t [[TMP6]] uint16x4x3_t test_vld3_dup_u16(uint16_t *a) { - // CHECK-LABEL: test_vld3_dup_u16 return vld3_dup_u16(a); - // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_dup_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x3_t [[TMP6]] uint32x2x3_t test_vld3_dup_u32(uint32_t *a) { - // CHECK-LABEL: test_vld3_dup_u32 return vld3_dup_u32(a); - // CHECK: ld3r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_dup_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } 
@llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x3_t [[TMP6]] uint64x1x3_t test_vld3_dup_u64(uint64_t *a) { - // CHECK-LABEL: test_vld3_dup_u64 return vld3_dup_u64(a); - // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x3_t [[TMP4]] int8x8x3_t test_vld3_dup_s8(int8_t *a) { - // CHECK-LABEL: test_vld3_dup_s8 return vld3_dup_s8(a); - // CHECK: ld3r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x3_t [[TMP6]] int16x4x3_t test_vld3_dup_s16(int16_t *a) { - // CHECK-LABEL: test_vld3_dup_s16 return vld3_dup_s16(a); - // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// 
CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x3_t [[TMP6]] int32x2x3_t test_vld3_dup_s32(int32_t *a) { - // CHECK-LABEL: test_vld3_dup_s32 return vld3_dup_s32(a); - // CHECK: ld3r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x3_t [[TMP6]] int64x1x3_t test_vld3_dup_s64(int64_t *a) { - // CHECK-LABEL: test_vld3_dup_s64 return vld3_dup_s64(a); - // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_dup_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x3_t [[TMP6]] float16x4x3_t test_vld3_dup_f16(float16_t *a) { - // CHECK-LABEL: test_vld3_dup_f16 return vld3_dup_f16(a); - // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x3_t 
@test_vld3_dup_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x3_t [[TMP6]] float32x2x3_t test_vld3_dup_f32(float32_t *a) { - // CHECK-LABEL: test_vld3_dup_f32 return vld3_dup_f32(a); - // CHECK: ld3r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x3_t [[TMP6]] float64x1x3_t test_vld3_dup_f64(float64_t *a) { - // CHECK-LABEL: test_vld3_dup_f64 return vld3_dup_f64(a); - // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_dup_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret 
%struct.poly8x8x3_t [[TMP4]] poly8x8x3_t test_vld3_dup_p8(poly8_t *a) { - // CHECK-LABEL: test_vld3_dup_p8 return vld3_dup_p8(a); - // CHECK: ld3r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_dup_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x3_t [[TMP6]] poly16x4x3_t test_vld3_dup_p16(poly16_t *a) { - // CHECK-LABEL: test_vld3_dup_p16 return vld3_dup_p16(a); - // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_dup_p64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x3_t [[TMP6]] poly64x1x3_t test_vld3_dup_p64(poly64_t *a) { - // CHECK-LABEL: test_vld3_dup_p64 return vld3_dup_p64(a); - // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, // [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_dup_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* +// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* 
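+// NOTE: the "i64 64" in the memcpy below is the size of the returned
+// aggregate: four 16-byte Q registers for the Q-form *x4_t structs. The
+// D-form vld4_dup tests further down copy "i64 32" (four 8-byte D
+// registers), and the vld3 variants above copy 48/24 bytes for the same
+// reason.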
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x4_t [[TMP4]] uint8x16x4_t test_vld4q_dup_u8(uint8_t *a) { - // CHECK-LABEL: test_vld4q_dup_u8 return vld4q_dup_u8(a); - // CHECK: ld4r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_dup_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x4_t [[TMP6]] uint16x8x4_t test_vld4q_dup_u16(uint16_t *a) { - // CHECK-LABEL: test_vld4q_dup_u16 return vld4q_dup_u16(a); - // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_dup_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x4_t [[TMP6]] uint32x4x4_t test_vld4q_dup_u32(uint32_t *a) { - // CHECK-LABEL: test_vld4q_dup_u32 return vld4q_dup_u32(a); - // CHECK: ld4r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_dup_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD4:%.*]] = call { <2 x i64>, 
<2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint64x2x4_t [[TMP6]] uint64x2x4_t test_vld4q_dup_u64(uint64_t *a) { - // CHECK-LABEL: test_vld4q_dup_u64 return vld4q_dup_u64(a); - // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x4_t [[TMP4]] int8x16x4_t test_vld4q_dup_s8(int8_t *a) { - // CHECK-LABEL: test_vld4q_dup_s8 return vld4q_dup_s8(a); - // CHECK: ld4r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x4_t [[TMP6]] int16x8x4_t test_vld4q_dup_s16(int16_t *a) { - // CHECK-LABEL: test_vld4q_dup_s16 return vld4q_dup_s16(a); - // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = 
alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x4_t [[TMP6]] int32x4x4_t test_vld4q_dup_s32(int32_t *a) { - // CHECK-LABEL: test_vld4q_dup_s32 return vld4q_dup_s32(a); - // CHECK: ld4r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x4_t [[TMP6]] int64x2x4_t test_vld4q_dup_s64(int64_t *a) { - // CHECK-LABEL: test_vld4q_dup_s64 return vld4q_dup_s64(a); - // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_dup_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// 
CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x4_t [[TMP6]] float16x8x4_t test_vld4q_dup_f16(float16_t *a) { - // CHECK-LABEL: test_vld4q_dup_f16 return vld4q_dup_f16(a); - // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x4_t [[TMP6]] float32x4x4_t test_vld4q_dup_f32(float32_t *a) { - // CHECK-LABEL: test_vld4q_dup_f32 return vld4q_dup_f32(a); - // CHECK: ld4r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x4_t [[TMP6]] float64x2x4_t test_vld4q_dup_f64(float64_t *a) { - // CHECK-LABEL: test_vld4q_dup_f64 return vld4q_dup_f64(a); - // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_dup_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a) +// 
CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x4_t [[TMP4]] poly8x16x4_t test_vld4q_dup_p8(poly8_t *a) { - // CHECK-LABEL: test_vld4q_dup_p8 return vld4q_dup_p8(a); - // CHECK: ld4r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_dup_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x4_t [[TMP6]] poly16x8x4_t test_vld4q_dup_p16(poly16_t *a) { - // CHECK-LABEL: test_vld4q_dup_p16 return vld4q_dup_p16(a); - // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_dup_p64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x4_t [[TMP6]] poly64x2x4_t test_vld4q_dup_p64(poly64_t *a) { - // CHECK-LABEL: test_vld4q_dup_p64 return vld4q_dup_p64(a); - // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x4_t 
@test_vld4_dup_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x4_t [[TMP4]] uint8x8x4_t test_vld4_dup_u8(uint8_t *a) { - // CHECK-LABEL: test_vld4_dup_u8 return vld4_dup_u8(a); - // CHECK: ld4r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_dup_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x4_t [[TMP6]] uint16x4x4_t test_vld4_dup_u16(uint16_t *a) { - // CHECK-LABEL: test_vld4_dup_u16 return vld4_dup_u16(a); - // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_dup_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 
8 +// CHECK: ret %struct.uint32x2x4_t [[TMP6]] uint32x2x4_t test_vld4_dup_u32(uint32_t *a) { - // CHECK-LABEL: test_vld4_dup_u32 return vld4_dup_u32(a); - // CHECK: ld4r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_dup_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x4_t [[TMP6]] uint64x1x4_t test_vld4_dup_u64(uint64_t *a) { - // CHECK-LABEL: test_vld4_dup_u64 return vld4_dup_u64(a); - // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x4_t [[TMP4]] int8x8x4_t test_vld4_dup_s8(int8_t *a) { - // CHECK-LABEL: test_vld4_dup_s8 return vld4_dup_s8(a); - // CHECK: ld4r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* 
[[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x4_t [[TMP6]] int16x4x4_t test_vld4_dup_s16(int16_t *a) { - // CHECK-LABEL: test_vld4_dup_s16 return vld4_dup_s16(a); - // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x4_t [[TMP6]] int32x2x4_t test_vld4_dup_s32(int32_t *a) { - // CHECK-LABEL: test_vld4_dup_s32 return vld4_dup_s32(a); - // CHECK: ld4r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x4_t [[TMP6]] int64x1x4_t test_vld4_dup_s64(int64_t *a) { - // CHECK-LABEL: test_vld4_dup_s64 return vld4_dup_s64(a); - // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_dup_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] 
to i16* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x4_t [[TMP6]] float16x4x4_t test_vld4_dup_f16(float16_t *a) { - // CHECK-LABEL: test_vld4_dup_f16 return vld4_dup_f16(a); - // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x4_t [[TMP6]] float32x2x4_t test_vld4_dup_f32(float32_t *a) { - // CHECK-LABEL: test_vld4_dup_f32 return vld4_dup_f32(a); - // CHECK: ld4r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +// CHECK: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x4_t [[TMP6]] float64x1x4_t 
test_vld4_dup_f64(float64_t *a) { - // CHECK-LABEL: test_vld4_dup_f64 return vld4_dup_f64(a); - // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_dup_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x4_t [[TMP4]] poly8x8x4_t test_vld4_dup_p8(poly8_t *a) { - // CHECK-LABEL: test_vld4_dup_p8 return vld4_dup_p8(a); - // CHECK: ld4r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_dup_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x4_t [[TMP6]] poly16x4x4_t test_vld4_dup_p16(poly16_t *a) { - // CHECK-LABEL: test_vld4_dup_p16 return vld4_dup_p16(a); - // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_dup_p64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = 
bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x4_t [[TMP6]] poly64x1x4_t test_vld4_dup_p64(poly64_t *a) { - // CHECK-LABEL: test_vld4_dup_p64 return vld4_dup_p64(a); - // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 +// CHECK: ret <16 x i8> [[VLD1_LANE]] uint8x16_t test_vld1q_lane_u8(uint8_t *a, uint8x16_t b) { - // CHECK-LABEL: test_vld1q_lane_u8 return vld1q_lane_u8(a, b, 15); - // CHECK: ld1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: ret <8 x i16> [[VLD1_LANE]] uint16x8_t test_vld1q_lane_u16(uint16_t *a, uint16x8_t b) { - // CHECK-LABEL: test_vld1q_lane_u16 return vld1q_lane_u16(a, b, 7); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3 +// CHECK: ret <4 x i32> [[VLD1_LANE]] uint32x4_t test_vld1q_lane_u32(uint32_t *a, uint32x4_t b) { - // CHECK-LABEL: test_vld1q_lane_u32 return vld1q_lane_u32(a, b, 3); - // CHECK: ld1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1 +// CHECK: ret <2 x i64> [[VLD1_LANE]] uint64x2_t test_vld1q_lane_u64(uint64_t *a, uint64x2_t b) { - // CHECK-LABEL: test_vld1q_lane_u64 return vld1q_lane_u64(a, b, 1); - // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 +// CHECK: ret <16 x i8> [[VLD1_LANE]] int8x16_t test_vld1q_lane_s8(int8_t *a, int8x16_t b) { - // CHECK-LABEL: test_vld1q_lane_s8 return vld1q_lane_s8(a, b, 15); - // CHECK: ld1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> 
%b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: ret <8 x i16> [[VLD1_LANE]] int16x8_t test_vld1q_lane_s16(int16_t *a, int16x8_t b) { - // CHECK-LABEL: test_vld1q_lane_s16 return vld1q_lane_s16(a, b, 7); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3 +// CHECK: ret <4 x i32> [[VLD1_LANE]] int32x4_t test_vld1q_lane_s32(int32_t *a, int32x4_t b) { - // CHECK-LABEL: test_vld1q_lane_s32 return vld1q_lane_s32(a, b, 3); - // CHECK: ld1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1 +// CHECK: ret <2 x i64> [[VLD1_LANE]] int64x2_t test_vld1q_lane_s64(int64_t *a, int64x2_t b) { - // CHECK-LABEL: test_vld1q_lane_s64 return vld1q_lane_s64(a, b, 1); - // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half> +// CHECK: ret <8 x half> [[TMP5]] float16x8_t test_vld1q_lane_f16(float16_t *a, float16x8_t b) { - // CHECK-LABEL: test_vld1q_lane_f16 return vld1q_lane_f16(a, b, 7); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float* +// CHECK: [[TMP4:%.*]] = load float, float* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3 +// CHECK: ret <4 x float> [[VLD1_LANE]] float32x4_t test_vld1q_lane_f32(float32_t *a, float32x4_t b) { - // CHECK-LABEL: test_vld1q_lane_f32 return vld1q_lane_f32(a, b, 3); - // CHECK: ld1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> 
[[TMP1]] to <2 x double> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double* +// CHECK: [[TMP4:%.*]] = load double, double* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP4]], i32 1 +// CHECK: ret <2 x double> [[VLD1_LANE]] float64x2_t test_vld1q_lane_f64(float64_t *a, float64x2_t b) { - // CHECK-LABEL: test_vld1q_lane_f64 return vld1q_lane_f64(a, b, 1); - // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 +// CHECK: ret <16 x i8> [[VLD1_LANE]] poly8x16_t test_vld1q_lane_p8(poly8_t *a, poly8x16_t b) { - // CHECK-LABEL: test_vld1q_lane_p8 return vld1q_lane_p8(a, b, 15); - // CHECK: ld1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: ret <8 x i16> [[VLD1_LANE]] poly16x8_t test_vld1q_lane_p16(poly16_t *a, poly16x8_t b) { - // CHECK-LABEL: test_vld1q_lane_p16 return vld1q_lane_p16(a, b, 7); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_p64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1 +// CHECK: ret <2 x i64> [[VLD1_LANE]] poly64x2_t test_vld1q_lane_p64(poly64_t *a, poly64x2_t b) { - // CHECK-LABEL: test_vld1q_lane_p64 return vld1q_lane_p64(a, b, 1); - // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 +// CHECK: ret <8 x i8> [[VLD1_LANE]] uint8x8_t test_vld1_lane_u8(uint8_t *a, uint8x8_t b) { - // CHECK-LABEL: test_vld1_lane_u8 return vld1_lane_u8(a, b, 7); - // CHECK: ld1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 +// CHECK: ret <4 x i16> [[VLD1_LANE]] uint16x4_t test_vld1_lane_u16(uint16_t *a, uint16x4_t b) { - // CHECK-LABEL: test_vld1_lane_u16 return vld1_lane_u16(a, b, 3); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast 
<8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1 +// CHECK: ret <2 x i32> [[VLD1_LANE]] uint32x2_t test_vld1_lane_u32(uint32_t *a, uint32x2_t b) { - // CHECK-LABEL: test_vld1_lane_u32 return vld1_lane_u32(a, b, 1); - // CHECK: ld1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0 +// CHECK: ret <1 x i64> [[VLD1_LANE]] uint64x1_t test_vld1_lane_u64(uint64_t *a, uint64x1_t b) { - // CHECK-LABEL: test_vld1_lane_u64 return vld1_lane_u64(a, b, 0); - // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 +// CHECK: ret <8 x i8> [[VLD1_LANE]] int8x8_t test_vld1_lane_s8(int8_t *a, int8x8_t b) { - // CHECK-LABEL: test_vld1_lane_s8 return vld1_lane_s8(a, b, 7); - // CHECK: ld1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 +// CHECK: ret <4 x i16> [[VLD1_LANE]] int16x4_t test_vld1_lane_s16(int16_t *a, int16x4_t b) { - // CHECK-LABEL: test_vld1_lane_s16 return vld1_lane_s16(a, b, 3); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1 +// CHECK: ret <2 x i32> [[VLD1_LANE]] int32x2_t test_vld1_lane_s32(int32_t *a, int32x2_t b) { - // CHECK-LABEL: test_vld1_lane_s32 return vld1_lane_s32(a, b, 1); - // CHECK: ld1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0 +// CHECK: ret <1 x i64> [[VLD1_LANE]] int64x1_t test_vld1_lane_s64(int64_t *a, int64x1_t b) { - // CHECK-LABEL: test_vld1_lane_s64 return vld1_lane_s64(a, b, 0); - // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}] } +// 
CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half> +// CHECK: ret <4 x half> [[TMP5]] float16x4_t test_vld1_lane_f16(float16_t *a, float16x4_t b) { - // CHECK-LABEL: test_vld1_lane_f16 return vld1_lane_f16(a, b, 3); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float* +// CHECK: [[TMP4:%.*]] = load float, float* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1 +// CHECK: ret <2 x float> [[VLD1_LANE]] float32x2_t test_vld1_lane_f32(float32_t *a, float32x2_t b) { - // CHECK-LABEL: test_vld1_lane_f32 return vld1_lane_f32(a, b, 1); - // CHECK: ld1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double* +// CHECK: [[TMP4:%.*]] = load double, double* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP4]], i32 0 +// CHECK: ret <1 x double> [[VLD1_LANE]] float64x1_t test_vld1_lane_f64(float64_t *a, float64x1_t b) { - // CHECK-LABEL: test_vld1_lane_f64 return vld1_lane_f64(a, b, 0); - // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 +// CHECK: ret <8 x i8> [[VLD1_LANE]] poly8x8_t test_vld1_lane_p8(poly8_t *a, poly8x8_t b) { - // CHECK-LABEL: test_vld1_lane_p8 return vld1_lane_p8(a, b, 7); - // CHECK: ld1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 +// CHECK: ret <4 x i16> [[VLD1_LANE]] poly16x4_t test_vld1_lane_p16(poly16_t *a, poly16x4_t b) { - // CHECK-LABEL: test_vld1_lane_p16 return vld1_lane_p16(a, b, 3); - // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <1 x i64> @test_vld1_lane_p64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* 
+// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0 +// CHECK: ret <1 x i64> [[VLD1_LANE]] poly64x1_t test_vld1_lane_p64(poly64_t *a, poly64x1_t b) { - // CHECK-LABEL: test_vld1_lane_p64 return vld1_lane_p64(a, b, 0); - // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_lane_s8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[SRC]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] %src.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[SRC]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr) +// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]] +// CHECK: [[TMP6:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false) +// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP8]] int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) { - // CHECK-LABEL: test_vld2q_lane_s8 return vld2q_lane_s8(ptr, src, 15); - // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [x0] } +// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[SRC]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] %src.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[SRC]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8*
[[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr) +// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]] +// CHECK: [[TMP6:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false) +// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x2_t [[TMP8]] uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) { - // CHECK-LABEL: test_vld2q_lane_u8 return vld2q_lane_u8(ptr, src, 15); - // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [x0] } +// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[SRC:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[SRC]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] %src.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[SRC]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr) +// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]] +// CHECK: [[TMP6:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8* +//
CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false) +// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x2_t [[TMP8]] poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) { - // CHECK-LABEL: test_vld2q_lane_p8 return vld2q_lane_p8(ptr, src, 15); - // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [x0] } +// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_lane_s8(i8* %ptr, [3 x <16 x i8>] %src.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[SRC]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] %src.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[SRC]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %ptr) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false) +// CHECK: [[TMP9:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x3_t [[TMP9]] int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) { - // CHECK-LABEL: test_vld3q_lane_s8 return vld3q_lane_s8(ptr, src, 15); - // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [x0] } +// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* %ptr, [3 x <16 x i8>] %src.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK:
[[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[SRC]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] %src.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[SRC]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %ptr) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false) +// CHECK: [[TMP9:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x3_t [[TMP9]] uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) { - // CHECK-LABEL: test_vld3q_lane_u8 return vld3q_lane_u8(ptr, src, 15); - // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [x0] } +// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds
%struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x2_t [[TMP13]] uint16x8x2_t test_vld2q_lane_u16(uint16_t *a, uint16x8x2_t b) { - // CHECK-LABEL: test_vld2q_lane_u16 return vld2q_lane_u16(a, b, 7); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] %b.coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x
i32> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x2_t [[TMP13]] uint32x4x2_t test_vld2q_lane_u32(uint32_t *a, uint32x4x2_t b) { - // CHECK-LABEL: test_vld2q_lane_u32 return vld2q_lane_u32(a, b, 3); - // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_lane_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] %b.coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint64x2x2_t [[TMP13]] uint64x2x2_t test_vld2q_lane_u64(uint64_t *a, uint64x2x2_t b) { - // CHECK-LABEL:
test_vld2q_lane_u64 return vld2q_lane_u64(a, b, 1); - // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP13]] int16x8x2_t test_vld2q_lane_s16(int16_t *a, int16x8x2_t b) { - // CHECK-LABEL: test_vld2q_lane_s16 return vld2q_lane_s16(a, b, 7); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] %b.coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8* +// CHECK:
call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP13]] int32x4x2_t test_vld2q_lane_s32(int32_t *a, int32x4x2_t b) { - // CHECK-LABEL: test_vld2q_lane_s32 return vld2q_lane_s32(a, b, 3); - // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] %b.coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x
i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x2_t [[TMP13]] int64x2x2_t test_vld2q_lane_s64(int64_t *a, int64x2x2_t b) { - // CHECK-LABEL: test_vld2q_lane_s64 return vld2q_lane_s64(a, b, 1); - // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x half>] %b.coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]],
i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x2_t [[TMP13]] float16x8x2_t test_vld2q_lane_f16(float16_t *a, float16x8x2_t b) { - // CHECK-LABEL: test_vld2q_lane_f16 return vld2q_lane_f16(a, b, 7); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x float>] %b.coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0i8(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float> } [[VLD2_LANE]], { <4 x float>, <4 x float> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP13]] float32x4x2_t test_vld2q_lane_f32(float32_t *a, float32x4x2_t b) { - // CHECK-LABEL: test_vld2q_lane_f32 return vld2q_lane_f32(a, b, 3); - // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca
%struct.float64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x double>] %b.coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double> +// CHECK: [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0i8(<2 x double> [[TMP8]], <2 x double> [[TMP9]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double> } [[VLD2_LANE]], { <2 x double>, <2 x double> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x2_t [[TMP13]] float64x2x2_t test_vld2q_lane_f64(float64_t *a, float64x2x2_t b) { - // CHECK-LABEL: test_vld2q_lane_f64 return vld2q_lane_f64(a, b, 1); - // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +//
CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]] +// CHECK: [[TMP11:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP13]] poly16x8x2_t test_vld2q_lane_p16(poly16_t *a, poly16x8x2_t b) { - // CHECK-LABEL: test_vld2q_lane_p16 return vld2q_lane_p16(a, b, 7); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_lane_p64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] %b.coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> }
@llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x2_t [[TMP1]]3 poly64x2x2_t test_vld2q_lane_p64(poly64_t *a, poly64x2x2_t b) { - // CHECK-LABEL: test_vld2q_lane_p64 return vld2q_lane_p64(a, b, 1); - // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a) +// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]] +// CHECK: [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false) +// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x2_t [[TMP8]] uint8x8x2_t test_vld2_lane_u8(uint8_t *a, uint8x8x2_t b) { - // CHECK-LABEL: test_vld2_lane_u8 return vld2_lane_u8(a, b, 7); - // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca 
%struct.uint16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP1]]3 uint16x4x2_t test_vld2_lane_u16(uint16_t *a, uint16x4x2_t b) { - // CHECK-LABEL: test_vld2_lane_u16 return vld2_lane_u16(a, b, 3); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// 
CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP1]]3 uint32x2x2_t test_vld2_lane_u32(uint32_t *a, uint32x2x2_t b) { - // CHECK-LABEL: test_vld2_lane_u32 return vld2_lane_u32(a, b, 1); - // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_lane_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* 
[[TMP2]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint64x1x2_t [[TMP13]]
 uint64x1x2_t test_vld2_lane_u64(uint64_t *a, uint64x1x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_u64
   return vld2_lane_u64(a, b, 0);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vld2_lane_s8(int8_t *a, int8x8x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_s8
   return vld2_lane_s8(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
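+// Illustrative usage sketch (hypothetical helper, assuming only <arm_neon.h>):
+// vld2_lane_s16 replaces lane 3 of each <4 x i16> half of `pair` with the two
+// int16_t values interleaved at `src`. Lanes 0-3 are valid for 64-bit vectors
+// of 16-bit elements, and the lane index must be a compile-time constant.
+static inline int16x4x2_t example_reload_s16_lane3(const int16_t *src,
+                                                   int16x4x2_t pair) {
+  return vld2_lane_s16(src, pair, 3);
+}
+// CHECK: store [2 x <4 x i16>]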
[[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP1]]3 int16x4x2_t test_vld2_lane_s16(int16_t *a, int16x4x2_t b) { - // CHECK-LABEL: test_vld2_lane_s16 return vld2_lane_s16(a, b, 3); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, 
%struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP1]]3 int32x2x2_t test_vld2_lane_s32(int32_t *a, int32x2x2_t b) { - // CHECK-LABEL: test_vld2_lane_s32 return vld2_lane_s32(a, b, 1); - // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = 
bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x2_t [[TMP1]]3 int64x1x2_t test_vld2_lane_s64(int64_t *a, int64x1x2_t b) { - // CHECK-LABEL: test_vld2_lane_s64 return vld2_lane_s64(a, b, 0); - // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x2_t [[TMP1]]3 float16x4x2_t test_vld2_lane_f16(float16_t *a, float16x4x2_t b) { - // CHECK-LABEL: test_vld2_lane_f16 return vld2_lane_f16(a, b, 3); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: 
[[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK: [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0i8(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float> } [[VLD2_LANE]], { <2 x float>, <2 x float> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x2_t [[TMP13]]
 float32x2x2_t test_vld2_lane_f32(float32_t *a, float32x2x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_f32
   return vld2_lane_f32(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
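+// Illustrative usage sketch (hypothetical helper, assuming only <arm_neon.h>):
+// for 64-bit elements in a 64-bit vector there is exactly one lane, so 0 is
+// the only valid constant lane index for vld2_lane_f64.
+static inline float64x1x2_t example_reload_f64_lane0(const float64_t *src,
+                                                     float64x1x2_t pair) {
+  return vld2_lane_f64(src, pair, 0);
+}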
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double> +// CHECK: [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0i8(<1 x double> [[TMP8]], <1 x double> [[TMP9]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double> } [[VLD2_LANE]], { <1 x double>, <1 x double> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x2_t [[TMP1]]3 float64x1x2_t test_vld2_lane_f64(float64_t *a, float64x1x2_t b) { - // CHECK-LABEL: test_vld2_lane_f64 return vld2_lane_f64(a, b, 0); - // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a) +// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]] +// CHECK: [[TMP6:%.*]] = bitcast 
%struct.poly8x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false) +// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x2_t [[TMP8]] poly8x8x2_t test_vld2_lane_p8(poly8_t *a, poly8x8x2_t b) { - // CHECK-LABEL: test_vld2_lane_p8 return vld2_lane_p8(a, b, 7); - // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP1]]3 poly16x4x2_t test_vld2_lane_p16(poly16_t *a, poly16x4x2_t b) { - // CHECK-LABEL: test_vld2_lane_p16 return vld2_lane_p16(a, b, 3); - // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_lane_p64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 
8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x2_t [[TMP1]]3 poly64x1x2_t test_vld2_lane_p64(poly64_t *a, poly64x1x2_t b) { - // CHECK-LABEL: test_vld2_lane_p64 return vld2_lane_p64(a, b, 0); - // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x 
<8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
+// CHECK: [[TMP14:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP16:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x3_t [[TMP16]]
 uint16x8x3_t test_vld3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_u16
   return vld3q_lane_u16(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
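+// Illustrative usage sketch (hypothetical helper, assuming only <arm_neon.h>):
+// the three-register form works the same way, with vld3q_lane_u32 refreshing
+// lane 3 of all three <4 x i32> registers of `triple` from three consecutive
+// uint32_t values at `src`; lanes 0-3 are valid here.
+static inline uint32x4x3_t example_reload_u32_lane3(const uint32_t *src,
+                                                    uint32x4x3_t triple) {
+  return vld3q_lane_u32(src, triple, 3);
+}
+//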
CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP1]]3 +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x3_t [[TMP1]]6 uint32x4x3_t test_vld3q_lane_u32(uint32_t *a, uint32x4x3_t b) { - // CHECK-LABEL: test_vld3q_lane_u32 return vld3q_lane_u32(a, b, 3); - // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_lane_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds 
%struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP1]]0, <2 x i64> [[TMP1]]1, <2 x i64> [[TMP1]]2, i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP1]]3 +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint64x2x3_t [[TMP1]]6 uint64x2x3_t test_vld3q_lane_u64(uint64_t *a, uint64x2x3_t b) { - // CHECK-LABEL: test_vld3q_lane_u64 return vld3q_lane_u64(a, b, 1); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x 
i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, i64 7, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP1]]3 +// CHECK: [[TMP14:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x3_t [[TMP1]]6 int16x8x3_t test_vld3q_lane_s16(int16_t *a, int16x8x3_t b) { - // CHECK-LABEL: test_vld3q_lane_s16 return vld3q_lane_s16(a, b, 7); - // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> 
[[TMP1]]0, <4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP1]]3 +// CHECK: [[TMP14:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x3_t [[TMP1]]6 int32x4x3_t test_vld3q_lane_s32(int32_t *a, int32x4x3_t b) { - // CHECK-LABEL: test_vld3q_lane_s32 return vld3q_lane_s32(a, b, 3); - // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP1]]0, <2 x i64> [[TMP1]]1, <2 x i64> [[TMP1]]2, i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP1]]3 +// CHECK: [[TMP14:%.*]] = bitcast 
%struct.int64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x3_t [[TMP1]]6 int64x2x3_t test_vld3q_lane_s64(int64_t *a, int64x2x3_t b) { - // CHECK-LABEL: test_vld3q_lane_s64 return vld3q_lane_s64(a, b, 1); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, i64 7, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP1]]3 +// CHECK: [[TMP14:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load 
%struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x3_t [[TMP16]] float16x8x3_t test_vld3q_lane_f16(float16_t *a, float16x8x3_t b) { - // CHECK-LABEL: test_vld3q_lane_f16 return vld3q_lane_f16(a, b, 7); - // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0i8(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x3_t [[TMP16]] float32x4x3_t test_vld3q_lane_f32(float32_t *a, float32x4x3_t b) { - // CHECK-LABEL: test_vld3q_lane_f32 return vld3q_lane_f32(a, b, 3); - // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] }
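Each of the q-register tests above exercises the same contract: vld3q_lane_TYPE takes a pointer to scalar data, a three-vector aggregate, and a constant lane index, and reloads only that lane of each of the three vectors. A minimal usage sketch under that contract (illustrative only; the function and parameter names here are invented and are not part of the test):

  #include <arm_neon.h>
  // Refresh lane 3 of three deinterleaved float32x4 vectors from p[0..2].
  // The lane index must be a compile-time constant in [0, 3] for 4-lane vectors.
  static float32x4x3_t refresh_lane3(const float32_t *p, float32x4x3_t acc) {
    return vld3q_lane_f32(p, acc, 3);
  }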
+// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0i8(<2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x double> [[TMP12]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x3_t [[TMP16]] float64x2x3_t test_vld3q_lane_f64(float64_t *a, float64x2x3_t b) { - // CHECK-LABEL: test_vld3q_lane_f64 return vld3q_lane_f64(a, b, 1); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] }
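A note on the FileCheck idiom these checks rely on: [[NAME:%.*]] binds NAME to whatever IR value the regex matches at the point of definition, and a later bare [[NAME]] must spell the full variable name ([[TMP13]], not [[TMP1]] followed by a literal 3). A tiny self-contained illustration (hypothetical IR, not taken from this test):

  // CHECK: [[SUM:%.*]] = add i32 %x, %y   ; binds SUM to, e.g., "%0"
  // CHECK: ret i32 [[SUM]]                ; then matches "ret i32 %0"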
+// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_lane_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false) +// CHECK: [[TMP9:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x3_t [[TMP9]] poly8x16x3_t test_vld3q_lane_p8(poly8_t *a, poly8x16x3_t b) { - // CHECK-LABEL: test_vld3q_lane_p8 return vld3q_lane_p8(a, b, 15); - // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64
48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x3_t [[TMP16]] poly16x8x3_t test_vld3q_lane_p16(poly16_t *a, poly16x8x3_t b) { - // CHECK-LABEL: test_vld3q_lane_p16 return vld3q_lane_p16(a, b, 7); - // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_lane_p64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] =
getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x3_t [[TMP16]] poly64x2x3_t test_vld3q_lane_p64(poly64_t *a, poly64x2x3_t b) { - // CHECK-LABEL: test_vld3q_lane_p64 return vld3q_lane_p64(a, b, 1); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] }
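From here on the tests switch from the 128-bit q forms to the 64-bit forms: the aggregates become 8-byte-aligned x8x3_t/x4x3_t/x2x3_t/x1x3_t structs and the struct copies shrink from 48 to 24 bytes, but the lane-load contract is unchanged. A sketch for the narrow form (illustrative only; names invented):

  #include <arm_neon.h>
  // Lane 7 is the last valid lane of an 8-lane <8 x i8> vector.
  static uint8x8x3_t refresh_u8(const uint8_t *p, uint8x8x3_t acc) {
    return vld3_lane_u8(p, acc, 7);
  }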
+// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false) +// CHECK: [[TMP9:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x3_t [[TMP9]] uint8x8x3_t test_vld3_lane_u8(uint8_t *a, uint8x8x3_t b) { - // CHECK-LABEL: test_vld3_lane_u8 return vld3_lane_u8(a, b, 7); - // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x
i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x3_t [[TMP16]] uint16x4x3_t test_vld3_lane_u16(uint16_t *a, uint16x4x3_t b) { - // CHECK-LABEL: test_vld3_lane_u16 return vld3_lane_u16(a, b, 3); - // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x
i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x3_t [[TMP16]] uint32x2x3_t test_vld3_lane_u32(uint32_t *a, uint32x2x3_t b) { - // CHECK-LABEL: test_vld3_lane_u32 return vld3_lane_u32(a, b, 1); - // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_lane_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> +// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x3_t [[TMP16]] uint64x1x3_t test_vld3_lane_u64(uint64_t *a, uint64x1x3_t b) { - // CHECK-LABEL: test_vld3_lane_u64 return vld3_lane_u64(a, b, 0); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] }
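For the 64-bit element types the 64-bit vectors have exactly one lane, so 0 is the only lane index these intrinsics accept, and the IR accordingly passes i64 0 to @llvm.aarch64.neon.ld3lane.v1i64. Sketch (illustrative only; names invented):

  #include <arm_neon.h>
  // <1 x i64> has a single lane; any index other than 0 is rejected at compile time.
  static uint64x1x3_t refresh_u64(const uint64_t *p, uint64x1x3_t acc) {
    return vld3_lane_u64(p, acc, 0);
  }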
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false) +// CHECK: [[TMP9:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x3_t [[TMP9]] int8x8x3_t test_vld3_lane_s8(int8_t *a, int8x8x3_t b) { - // CHECK-LABEL: test_vld3_lane_s8 return vld3_lane_s8(a, b, 7); - // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x
i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x3_t [[TMP16]] int16x4x3_t test_vld3_lane_s16(int16_t *a, int16x4x3_t b) { - // CHECK-LABEL: test_vld3_lane_s16 return vld3_lane_s16(a, b, 3); - // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast
%struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x3_t [[TMP16]] int32x2x3_t test_vld3_lane_s32(int32_t *a, int32x2x3_t b) { - // CHECK-LABEL: test_vld3_lane_s32 return vld3_lane_s32(a, b, 1); - // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x
i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> +// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x3_t [[TMP16]] int64x1x3_t test_vld3_lane_s64(int64_t *a, int64x1x3_t b) { - // CHECK-LABEL: test_vld3_lane_s64 return vld3_lane_s64(a, b, 0); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]*
[[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x3_t [[TMP16]] float16x4x3_t test_vld3_lane_f16(float16_t *a, float16x4x3_t b) { - // CHECK-LABEL: test_vld3_lane_f16 return vld3_lane_f16(a, b, 3); - // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t*
[[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0i8(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x3_t [[TMP16]] float32x2x3_t test_vld3_lane_f32(float32_t *a, float32x2x3_t b) { - // CHECK-LABEL: test_vld3_lane_f32 return vld3_lane_f32(a, b, 1); - // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x double>, <1 x double>*
[[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double> +// CHECK: [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0i8(<1 x double> [[TMP10]], <1 x double> [[TMP11]], <1 x double> [[TMP12]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x3_t [[TMP16]] float64x1x3_t test_vld3_lane_f64(float64_t *a, float64x1x3_t b) { - // CHECK-LABEL: test_vld3_lane_f64 return vld3_lane_f64(a, b, 0); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8* +//
CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false) +// CHECK: [[TMP9:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x3_t [[TMP9]] poly8x8x3_t test_vld3_lane_p8(poly8_t *a, poly8x8x3_t b) { - // CHECK-LABEL: test_vld3_lane_p8 return vld3_lane_p8(a, b, 7); - // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x3_t [[TMP16]] poly16x4x3_t
test_vld3_lane_p16(poly16_t *a, poly16x4x3_t b) { - // CHECK-LABEL: test_vld3_lane_p16 return vld3_lane_p16(a, b, 3); - // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_lane_p64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> +// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x3_t [[TMP16]] poly64x1x3_t test_vld3_lane_p64(poly64_t *a, poly64x1x3_t b) { - // CHECK-LABEL: test_vld3_lane_p64 return vld3_lane_p64(a, b, 0); - // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] }
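The remaining tests repeat the whole pattern for the four-register loads: [4 x <16 x i8>] coercions, 64-byte struct copies, and calls to the @llvm.aarch64.neon.ld4lane intrinsics. A sketch of the corresponding source-level call (illustrative only; names invented):

  #include <arm_neon.h>
  // Four vectors at once; lane 15 is the last byte lane of a <16 x i8> vector.
  static uint8x16x4_t refresh_u8q4(const uint8_t *p, uint8x16x4_t acc) {
    return vld4q_lane_u8(p, acc, 15);
  }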
[[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16 +// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x4_t [[TMP10]] uint8x16x4_t test_vld4q_lane_u8(uint8_t *a, uint8x16x4_t b) { - // CHECK-LABEL: test_vld4q_lane_u8 return vld4q_lane_u8(a, b, 15); - // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x
<8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x4_t [[TMP19]] uint16x8x4_t test_vld4q_lane_u16(uint16_t *a, uint16x8x4_t b) { - // CHECK-LABEL: test_vld4q_lane_u16 return vld4q_lane_u16(a, b, 7); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t,
align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] %b.coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x4_t [[TMP19]] uint32x4x4_t test_vld4q_lane_u32(uint32_t *a, uint32x4x4_t b) { - // CHECK-LABEL: test_vld4q_lane_u32 return vld4q_lane_u32(a, b, 3); - // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_lane_u64(i64*
%a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] %b.coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint64x2x4_t [[TMP19]] uint64x2x4_t test_vld4q_lane_u64(uint64_t *a, uint64x2x4_t b) { - // CHECK-LABEL:
test_vld4q_lane_u64 return vld4q_lane_u64(a, b, 1); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16 +// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x4_t [[TMP10]] int8x16x4_t test_vld4q_lane_s8(int8_t *a, int8x16x4_t b) { - // CHECK-LABEL: test_vld4q_lane_s8 return vld4q_lane_s8(a, b, 15); - // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca
%struct.int16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x4_t [[TMP19]] int16x8x4_t test_vld4q_lane_s16(int16_t *a, int16x8x4_t b) { - // CHECK-LABEL: test_vld4q_lane_s16 return vld4q_lane_s16(a, b, 7); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a,
[4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] %b.coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x4_t [[TMP19]] int32x4x4_t test_vld4q_lane_s32(int32_t *a, int32x4x4_t b) { - // CHECK-LABEL: test_vld4q_lane_s32 return
vld4q_lane_s32(a, b, 3); - // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] %b.coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int64x2x4_t,
%struct.int64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int64x2x4_t [[TMP19]] int64x2x4_t test_vld4q_lane_s64(int64_t *a, int64x2x4_t b) { - // CHECK-LABEL: test_vld4q_lane_s64 return vld4q_lane_s64(a, b, 1); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x half>] %b.coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast
%struct.float16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x4_t [[TMP19]] float16x8x4_t test_vld4q_lane_f16(float16_t *a, float16x8x4_t b) { - // CHECK-LABEL: test_vld4q_lane_f16 return vld4q_lane_f16(a, b, 7); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x float>] %b.coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0i8(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]],
<4 x float> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x4_t [[TMP19]] float32x4x4_t test_vld4q_lane_f32(float32_t *a, float32x4x4_t b) { - // CHECK-LABEL: test_vld4q_lane_f32 return vld4q_lane_f32(a, b, 3); - // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x double>] %b.coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <2 x double> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK: [[TMP13:%.*]] = bitcast <16 x
i8> [[TMP7]] to <2 x double> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x double> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0i8(<2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* +// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float64x2x4_t [[TMP19]] float64x2x4_t test_vld4q_lane_f64(float64_t *a, float64x2x4_t b) { - // CHECK-LABEL: test_vld4q_lane_f64 return vld4q_lane_f64(a, b, 1); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_lane_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16 +// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8>
[[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x4_t [[TMP10]] poly8x16x4_t test_vld4q_lane_p8(poly8_t *a, poly8x16x4_t b) { - // CHECK-LABEL: test_vld4q_lane_p8 return vld4q_lane_p8(a, b, 15); - // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x4_t [[TMP19]] poly16x8x4_t test_vld4q_lane_p16(poly16_t *a, poly16x8x4_t b) { - // CHECK-LABEL: test_vld4q_lane_p16 return vld4q_lane_p16(a, b, 7); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_lane_p64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] %b.coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <2 x i64>
[[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x4_t [[TMP19]] poly64x2x4_t test_vld4q_lane_p64(poly64_t *a, poly64x2x4_t b) { - // CHECK-LABEL: test_vld4q_lane_p64 return vld4q_lane_p64(a, b, 1); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8 +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8>
[[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x4_t [[TMP10]] uint8x8x4_t test_vld4_lane_u8(uint8_t *a, uint8x8x4_t b) { - // CHECK-LABEL: test_vld4_lane_u8 return vld4_lane_u8(a, b, 7); - // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8>
[[TMP11]] to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x4_t [[TMP19]] uint16x4x4_t test_vld4_lane_u16(uint16_t *a, uint16x4x4_t b) { - // CHECK-LABEL: test_vld4_lane_u16 return vld4_lane_u16(a, b, 3); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i32>] %b.coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8>
[[TMP5]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <2 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP1]]2, <2 x i32> [[TMP1]]3, <2 x i32> [[TMP1]]4, <2 x i32> [[TMP1]]5, i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x4_t [[TMP1]]9 uint32x2x4_t test_vld4_lane_u32(uint32_t *a, uint32x2x4_t b) { - // CHECK-LABEL: test_vld4_lane_u32 return vld4_lane_u32(a, b, 1); - // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_lane_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]5, i64 0, 
i64 3 +// CHECK: [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <1 x i64> +// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, <1 x i64> [[TMP1]]5, i64 0, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x4_t [[TMP1]]9 uint64x1x4_t test_vld4_lane_u64(uint64_t *a, uint64x1x4_t b) { - // CHECK-LABEL: test_vld4_lane_u64 return vld4_lane_u64(a, b, 0); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8 +// CHECK: 
[[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x4_t [[TMP1]]0 int8x8x4_t test_vld4_lane_s8(int8_t *a, int8x8x4_t b) { - // CHECK-LABEL: test_vld4_lane_s8 return vld4_lane_s8(a, b, 7); - // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = 
bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, <4 x i16> [[TMP1]]5, i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x4_t [[TMP1]]9 int16x4x4_t test_vld4_lane_s16(int16_t *a, int16x4x4_t b) { - // CHECK-LABEL: test_vld4_lane_s16 return vld4_lane_s16(a, b, 3); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP1]]0 to <8 
x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <2 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP1]]2, <2 x i32> [[TMP1]]3, <2 x i32> [[TMP1]]4, <2 x i32> [[TMP1]]5, i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x4_t [[TMP1]]9 int32x2x4_t test_vld4_lane_s32(int32_t *a, int32x2x4_t b) { - // CHECK-LABEL: test_vld4_lane_s32 return vld4_lane_s32(a, b, 1); - // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x 
i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <1 x i64> +// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, <1 x i64> [[TMP1]]5, i64 0, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x4_t [[TMP1]]9 int64x1x4_t test_vld4_lane_s64(int64_t *a, int64x1x4_t b) { - // CHECK-LABEL: test_vld4_lane_s64 return vld4_lane_s64(a, b, 0); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x 
half> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, <4 x i16> [[TMP1]]5, i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x4_t [[TMP1]]9 float16x4x4_t test_vld4_lane_f16(float16_t *a, float16x4x4_t b) { - // CHECK-LABEL: test_vld4_lane_f16 return vld4_lane_f16(a, b, 3); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, 
%struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <2 x float> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0i8(<2 x float> [[TMP1]]2, <2 x float> [[TMP1]]3, <2 x float> [[TMP1]]4, <2 x float> [[TMP1]]5, i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x4_t [[TMP1]]9 float32x2x4_t test_vld4_lane_f32(float32_t *a, float32x2x4_t b) { - // CHECK-LABEL: test_vld4_lane_f32 return vld4_lane_f32(a, b, 1); - // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// 
CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <1 x double> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <1 x double> +// CHECK: [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0i8(<1 x double> [[TMP1]]2, <1 x double> [[TMP1]]3, <1 x double> [[TMP1]]4, <1 x double> [[TMP1]]5, i64 0, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* +// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float64x1x4_t [[TMP1]]9 float64x1x4_t test_vld4_lane_f64(float64_t *a, float64x1x4_t b) { - // CHECK-LABEL: test_vld4_lane_f64 return vld4_lane_f64(a, b, 0); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// 
CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8 +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x4_t [[TMP1]]0 poly8x8x4_t test_vld4_lane_p8(poly8_t *a, poly8x8x4_t b) { - // CHECK-LABEL: test_vld4_lane_p8 return vld4_lane_p8(a, b, 7); - // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, 
<4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, <4 x i16> [[TMP1]]5, i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x4_t [[TMP1]]9 poly16x4x4_t test_vld4_lane_p16(poly16_t *a, poly16x4x4_t b) { - // CHECK-LABEL: test_vld4_lane_p16 return vld4_lane_p16(a, b, 3); - // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_lane_p64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, 
%struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP1]]0 to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <1 x i64> +// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, <1 x i64> [[TMP1]]5, i64 0, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP1]]6 +// CHECK: [[TMP17:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x4_t [[TMP1]]9 poly64x1x4_t test_vld4_lane_p64(poly64_t *a, poly64x1x4_t b) { - // CHECK-LABEL: test_vld4_lane_p64 return vld4_lane_p64(a, b, 0); - // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_lane_u8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15 +// CHECK: store i8 [[TMP0]], i8* %a +// CHECK: ret void void test_vst1q_lane_u8(uint8_t *a, uint8x16_t b) { - // CHECK-LABEL: test_vst1q_lane_u8 vst1q_lane_u8(a, b, 15); - // CHECK: st1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_lane_u16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_u16(uint16_t *a, uint16x8_t b) { - // CHECK-LABEL: test_vst1q_lane_u16 vst1q_lane_u16(a, b, 7); - // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_lane_u32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> 
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_u32(uint32_t *a, uint32x4_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u32
   vst1q_lane_u32(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_u64(uint64_t *a, uint64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u64
   vst1q_lane_u64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK: store i8 [[TMP0]], i8* %a
+// CHECK: ret void
 void test_vst1q_lane_s8(int8_t *a, int8x16_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s8
   vst1q_lane_s8(a, b, 15);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_s16(int16_t *a, int16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s16
   vst1q_lane_s16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_s32(int32_t *a, int32x4_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s32
   vst1q_lane_s32(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_s64(int64_t *a, int64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s64
   vst1q_lane_s64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_f16(half* %a, <8 x half> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_f16(float16_t *a, float16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_f16
   vst1q_lane_f16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK: store float [[TMP3]], float* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_f32(float32_t *a, float32x4_t b) {
-  // CHECK-LABEL: test_vst1q_lane_f32
   vst1q_lane_f32(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK: store double [[TMP3]], double* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_f64(float64_t *a, float64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_f64
   vst1q_lane_f64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK: store i8 [[TMP0]], i8* %a
+// CHECK: ret void
 void test_vst1q_lane_p8(poly8_t *a, poly8x16_t b) {
-  // CHECK-LABEL: test_vst1q_lane_p8
   vst1q_lane_p8(a, b, 15);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_p16(poly16_t *a, poly16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_p16
   vst1q_lane_p16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1q_lane_p64(i64* %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK: ret void
 void test_vst1q_lane_p64(poly64_t *a, poly64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_p64
   vst1q_lane_p64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK: store i8 [[TMP0]], i8* %a
+// CHECK: ret void
 void test_vst1_lane_u8(uint8_t *a, uint8x8_t b) {
-  // CHECK-LABEL: test_vst1_lane_u8
   vst1_lane_u8(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_u16(uint16_t *a, uint16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_u16
   vst1_lane_u16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_u32(uint32_t *a, uint32x2_t b) {
-  // CHECK-LABEL: test_vst1_lane_u32
   vst1_lane_u32(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_u64(uint64_t *a, uint64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_u64
   vst1_lane_u64(a, b, 0);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK: store i8 [[TMP0]], i8* %a
+// CHECK: ret void
 void test_vst1_lane_s8(int8_t *a, int8x8_t b) {
-  // CHECK-LABEL: test_vst1_lane_s8
   vst1_lane_s8(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_s16(int16_t *a, int16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_s16
   vst1_lane_s16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_s32(int32_t *a, int32x2_t b) {
-  // CHECK-LABEL: test_vst1_lane_s32
   vst1_lane_s32(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_s64(int64_t *a, int64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_s64
   vst1_lane_s64(a, b, 0);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_f16(float16_t *a, float16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_f16
   vst1_lane_f16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_f32(float* %a, <2 x float> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK: store float [[TMP3]], float* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_f32(float32_t *a, float32x2_t b) {
-  // CHECK-LABEL: test_vst1_lane_f32
   vst1_lane_f32(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_f64(double* %a, <1 x double> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK: [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK: store double [[TMP3]], double* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_f64(float64_t *a, float64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_f64
   vst1_lane_f64(a, b, 0);
-  // CHECK: {{st1 { v[0-9]+.d }\[0]|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK: store i8 [[TMP0]], i8* %a
+// CHECK: ret void
 void test_vst1_lane_p8(poly8_t *a, poly8x8_t b) {
-  // CHECK-LABEL: test_vst1_lane_p8
   vst1_lane_p8(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_p16(poly16_t *a, poly16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_p16
   vst1_lane_p16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst1_lane_p64(i64* %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK: ret void
 void test_vst1_lane_p64(poly64_t *a, poly64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_p64
   vst1_lane_p64(a, b, 0);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst2q_lane_u8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <16 x i8>] %b.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
+// CHECK: ret void
 void test_vst2q_lane_u8(uint8_t *a, uint8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u8
   vst2q_lane_u8(a, b, 15);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u16
   vst2q_lane_u16(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <4 x i32>] %b.coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u32
   vst2q_lane_u32(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst2q_lane_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <2 x i64>] %b.coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u64
   vst2q_lane_u64(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <16 x i8>] %b.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
+// CHECK: ret void
 void test_vst2q_lane_s8(int8_t *a, int8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_s8
   vst2q_lane_s8(a, b, 15);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
+// CHECK-LABEL: define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x
i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_s16(int16_t *a, int16x8x2_t b) { - // CHECK-LABEL: test_vst2q_lane_s16 vst2q_lane_s16(a, b, 7); - // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_s32(int32_t *a, int32x4x2_t b) { - // CHECK-LABEL: test_vst2q_lane_s32 vst2q_lane_s32(a, b, 3); - // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// 
CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_s64(int64_t *a, int64x2x2_t b) { - // CHECK-LABEL: test_vst2q_lane_s64 vst2q_lane_s64(a, b, 1); - // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_f16(float16_t *a, float16x8x2_t b) { - // CHECK-LABEL: test_vst2q_lane_f16 vst2q_lane_f16(a, b, 7); - // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds 
%struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_f32(float32_t *a, float32x4x2_t b) { - // CHECK-LABEL: test_vst2q_lane_f32 vst2q_lane_f32(a, b, 3); - // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2f64.p0i8(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_f64(float64_t *a, float64x2x2_t b) { - // CHECK-LABEL: test_vst2q_lane_f64 vst2q_lane_f64(a, b, 1); - // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_p8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x 
<16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a) +// CHECK: ret void void test_vst2q_lane_p8(poly8_t *a, poly8x16x2_t b) { - // CHECK-LABEL: test_vst2q_lane_p8 vst2q_lane_p8(a, b, 15); - // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_p16(poly16_t *a, poly16x8x2_t b) { - // CHECK-LABEL: test_vst2q_lane_p16 vst2q_lane_p16(a, b, 7); - // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_lane_p64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// 
CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_p64(poly64_t *a, poly64x2x2_t b) { - // CHECK-LABEL: test_vst2q_lane_p64 vst2q_lane_p64(a, b, 1); - // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a) +// CHECK: ret void void test_vst2_lane_u8(uint8_t *a, uint8x8x2_t b) { - // CHECK-LABEL: test_vst2_lane_u8 vst2_lane_u8(a, b, 7); - // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8* +// CHECK: 
[[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_u16(uint16_t *a, uint16x4x2_t b) { - // CHECK-LABEL: test_vst2_lane_u16 vst2_lane_u16(a, b, 3); - // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_u32(uint32_t *a, uint32x2x2_t b) { - // CHECK-LABEL: test_vst2_lane_u32 vst2_lane_u32(a, b, 1); - // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = 
getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_u64(uint64_t *a, uint64x1x2_t b) { - // CHECK-LABEL: test_vst2_lane_u64 vst2_lane_u64(a, b, 0); - // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a) +// CHECK: ret void void test_vst2_lane_s8(int8_t *a, int8x8x2_t b) { - // CHECK-LABEL: test_vst2_lane_s8 vst2_lane_s8(a, b, 7); - // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x 
i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_s16(int16_t *a, int16x4x2_t b) { - // CHECK-LABEL: test_vst2_lane_s16 vst2_lane_s16(a, b, 3); - // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_s32(int32_t *a, int32x2x2_t b) { - // CHECK-LABEL: test_vst2_lane_s32 vst2_lane_s32(a, b, 1); - // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca 
%struct.int64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_s64(int64_t *a, int64x1x2_t b) { - // CHECK-LABEL: test_vst2_lane_s64 vst2_lane_s64(a, b, 0); - // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_f16(float16_t *a, 
float16x4x2_t b) { - // CHECK-LABEL: test_vst2_lane_f16 vst2_lane_f16(a, b, 3); - // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_f32(float32_t *a, float32x2x2_t b) { - // CHECK-LABEL: test_vst2_lane_f32 vst2_lane_f32(a, b, 1); - // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]]2, align 8 +// CHECK: 
[[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> +// CHECK: call void @llvm.aarch64.neon.st2lane.v1f64.p0i8(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_f64(float64_t *a, float64x1x2_t b) { - // CHECK-LABEL: test_vst2_lane_f64 vst2_lane_f64(a, b, 0); - // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a) +// CHECK: ret void void test_vst2_lane_p8(poly8_t *a, poly8x8x2_t b) { - // CHECK-LABEL: test_vst2_lane_p8 vst2_lane_p8(a, b, 7); - // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = 
bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_p16(poly16_t *a, poly16x4x2_t b) { - // CHECK-LABEL: test_vst2_lane_p16 vst2_lane_p16(a, b, 3); - // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_lane_p64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_p64(poly64_t *a, poly64x1x2_t b) { - // CHECK-LABEL: test_vst2_lane_p64 vst2_lane_p64(a, b, 0); - // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst3q_lane_u8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = 
load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a) +// CHECK: ret void void test_vst3q_lane_u8(uint8_t *a, uint8x16x3_t b) { - // CHECK-LABEL: test_vst3q_lane_u8 vst3q_lane_u8(a, b, 15); - // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_u16(uint16_t *a, uint16x8x3_t b) { - // CHECK-LABEL: test_vst3q_lane_u16 vst3q_lane_u16(a, b, 7); - // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast 
%struct.uint32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_u32(uint32_t *a, uint32x4x3_t b) { - // CHECK-LABEL: test_vst3q_lane_u32 vst3q_lane_u32(a, b, 3); - // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst3q_lane_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x 
<2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_u64
   vst3q_lane_u64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
+// CHECK: ret void
 void test_vst3q_lane_s8(int8_t *a, int8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s8
   vst3q_lane_s8(a, b, 15);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_s16(int16_t *a, int16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s16
   vst3q_lane_s16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_s32(int32_t *a, int32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s32
   vst3q_lane_s32(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_s64(int64_t *a, int64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s64
   vst3q_lane_s64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x half>] %b.coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_f16(float16_t *a, float16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_f16
   vst3q_lane_f16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x float>] %b.coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_f32(float32_t *a, float32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_f32
   vst3q_lane_f32(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x double>] %b.coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v2f64.p0i8(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_f64(float64_t *a, float64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_f64
   vst3q_lane_f64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
+// CHECK: ret void
 void test_vst3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_p8
   vst3q_lane_p8(a, b, 15);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_p16
   vst3q_lane_p16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3q_lane_p64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_p64
   vst3q_lane_p64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK: ret void
 void test_vst3_lane_u8(uint8_t *a, uint8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u8
   vst3_lane_u8(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_u16(uint16_t *a, uint16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u16
   vst3_lane_u16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_u32(uint32_t *a, uint32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u32
   vst3_lane_u32(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_u64(uint64_t *a, uint64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u64
   vst3_lane_u64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK: ret void
 void test_vst3_lane_s8(int8_t *a, int8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s8
   vst3_lane_s8(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_s16(int16_t *a, int16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s16
   vst3_lane_s16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_s32(int32_t *a, int32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s32
   vst3_lane_s32(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_s64(int64_t *a, int64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s64
   vst3_lane_s64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x half>] %b.coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_f16(float16_t *a, float16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_f16
   vst3_lane_f16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x float>] %b.coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_f32(float32_t *a, float32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_f32
   vst3_lane_f32(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x double>] %b.coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v1f64.p0i8(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_f64(float64_t *a, float64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_f64
   vst3_lane_f64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK: ret void
 void test_vst3_lane_p8(poly8_t *a, poly8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_p8
   vst3_lane_p8(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_p16(poly16_t *a, poly16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_p16
   vst3_lane_p16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst3_lane_p64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_lane_p64(poly64_t *a, poly64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_p64
   vst3_lane_p64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst4q_lane_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK: ret void
 void test_vst4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u8
   vst4q_lane_u8(a, b, 15);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u16
   vst4q_lane_u16(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i32>] %b.coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u32
   vst4q_lane_u32(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst4q_lane_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i64>] %b.coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u64
   vst4q_lane_u64(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK: ret void
 void test_vst4q_lane_s8(int8_t *a, int8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_s8
   vst4q_lane_s8(a, b, 15);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }

+// CHECK-LABEL: define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x
i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_s16(int16_t *a, int16x8x4_t b) { - // CHECK-LABEL: test_vst4q_lane_s16 vst4q_lane_s16(a, b, 7); - // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, <4 x i32> [[TMP1]]3, <4 x i32> [[TMP1]]4, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_s32(int32_t *a, int32x4x4_t b) { - // CHECK-LABEL: test_vst4q_lane_s32 vst4q_lane_s32(a, b, 3); - // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void 
@test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP1]]1, <2 x i64> [[TMP1]]2, <2 x i64> [[TMP1]]3, <2 x i64> [[TMP1]]4, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_s64(int64_t *a, int64x2x4_t b) { - // CHECK-LABEL: test_vst4q_lane_s64 vst4q_lane_s64(a, b, 1); - // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: 
[[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_f16(float16_t *a, float16x8x4_t b) { - // CHECK-LABEL: test_vst4q_lane_f16 vst4q_lane_f16(a, b, 7); - // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x 
float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x float> +// CHECK: call void @llvm.aarch64.neon.st4lane.v4f32.p0i8(<4 x float> [[TMP1]]1, <4 x float> [[TMP1]]2, <4 x float> [[TMP1]]3, <4 x float> [[TMP1]]4, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_f32(float32_t *a, float32x4x4_t b) { - // CHECK-LABEL: test_vst4q_lane_f32 vst4q_lane_f32(a, b, 3); - // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, 
i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <2 x double> +// CHECK: call void @llvm.aarch64.neon.st4lane.v2f64.p0i8(<2 x double> [[TMP1]]1, <2 x double> [[TMP1]]2, <2 x double> [[TMP1]]3, <2 x double> [[TMP1]]4, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_f64(float64_t *a, float64x2x4_t b) { - // CHECK-LABEL: test_vst4q_lane_f64 vst4q_lane_f64(a, b, 1); - // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_lane_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]6, align 16 +// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a) +// CHECK: ret void void test_vst4q_lane_p8(poly8_t *a, poly8x16x4_t b) { - // CHECK-LABEL: test_vst4q_lane_p8 vst4q_lane_p8(a, b, 15); - // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], 
i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_p16(poly16_t *a, poly16x8x4_t b) { - // CHECK-LABEL: test_vst4q_lane_p16 vst4q_lane_p16(a, b, 7); - // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_lane_p64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP1]]1, <2 x i64> [[TMP1]]2, <2 x i64> [[TMP1]]3, <2 x i64> [[TMP1]]4, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_p64(poly64_t *a, poly64x2x4_t b) { - // CHECK-LABEL: test_vst4q_lane_p64 vst4q_lane_p64(a, b, 1); - // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x 
<8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8 +// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: ret void void test_vst4_lane_u8(uint8_t *a, uint8x8x4_t b) { - // CHECK-LABEL: test_vst4_lane_u8 vst4_lane_u8(a, b, 7); - // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_u16(uint16_t *a, uint16x4x4_t b) { - // CHECK-LABEL: test_vst4_lane_u16 vst4_lane_u16(a, b, 3); - // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// 
CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP1]]1, <2 x i32> [[TMP1]]2, <2 x i32> [[TMP1]]3, <2 x i32> [[TMP1]]4, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_u32(uint32_t *a, uint32x2x4_t b) { - // CHECK-LABEL: test_vst4_lane_u32 vst4_lane_u32(a, b, 1); - // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, 
i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP1]]1, <1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_u64(uint64_t *a, uint64x1x4_t b) { - // CHECK-LABEL: test_vst4_lane_u64 vst4_lane_u64(a, b, 0); - // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// 
CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8 +// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: ret void void test_vst4_lane_s8(int8_t *a, int8x8x4_t b) { - // CHECK-LABEL: test_vst4_lane_s8 vst4_lane_s8(a, b, 7); - // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_s16(int16_t *a, int16x4x4_t b) { - // CHECK-LABEL: test_vst4_lane_s16 vst4_lane_s16(a, b, 3); - // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca 
%struct.int32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP1]]1, <2 x i32> [[TMP1]]2, <2 x i32> [[TMP1]]3, <2 x i32> [[TMP1]]4, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_s32(int32_t *a, int32x2x4_t b) { - // CHECK-LABEL: test_vst4_lane_s32 vst4_lane_s32(a, b, 1); - // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* 
[[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]6, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP1]]1, <1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_s64(int64_t *a, int64x1x4_t b) { - // CHECK-LABEL: test_vst4_lane_s64 vst4_lane_s64(a, b, 0); - // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr 
inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_f16(float16_t *a, float16x4x4_t b) { - // CHECK-LABEL: test_vst4_lane_f16 vst4_lane_f16(a, b, 3); - // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x float>] %b.coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x
float> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float> +// CHECK: call void @llvm.aarch64.neon.st4lane.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_f32(float32_t *a, float32x2x4_t b) { - // CHECK-LABEL: test_vst4_lane_f32 vst4_lane_f32(a, b, 1); - // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x double>] %b.coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double> +// CHECK: call void @llvm.aarch64.neon.st4lane.v1f64.p0i8(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_f64(float64_t *a, float64x1x4_t b) { - // CHECK-LABEL: test_vst4_lane_f64 vst4_lane_f64(a, b, 0); - // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0],
[{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8 +// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: ret void void test_vst4_lane_p8(poly8_t *a, poly8x8x4_t b) { - // CHECK-LABEL: test_vst4_lane_p8 vst4_lane_p8(a, b, 7); - // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK:
[[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_p16(poly16_t *a, poly16x4x4_t b) { - // CHECK-LABEL: test_vst4_lane_p16 vst4_lane_p16(a, b, 3); - // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_lane_p64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] %b.coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64
0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_lane_p64(poly64_t *a, poly64x1x4_t b) { - // CHECK-LABEL: test_vst4_lane_p64 vst4_lane_p64(a, b, 0); - // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}] } Index: test/CodeGen/aarch64-neon-misc.c =================================================================== --- test/CodeGen/aarch64-neon-misc.c +++ test/CodeGen/aarch64-neon-misc.c @@ -1,2041 +1,3068 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -fallow-half-arguments-and-returns -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg | FileCheck %s // Test new aarch64 intrinsics and types #include <arm_neon.h> -// CHECK-LABEL: test_vceqz_s8 -// CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}} +// CHECK-LABEL: define <8 x i8> @test_vceqz_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: ret <8 x i8> [[VCEQZ_I]] uint8x8_t test_vceqz_s8(int8x8_t a) { return vceqz_s8(a); } -// CHECK-LABEL: test_vceqz_s16 -// CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}} +// CHECK-LABEL: define <4 x i16> @test_vceqz_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VCEQZ_I]] uint16x4_t test_vceqz_s16(int16x4_t a) { return vceqz_s16(a); } -// CHECK-LABEL: test_vceqz_s32 -// CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}} +// CHECK-LABEL: define <2 x i32> @test_vceqz_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCEQZ_I]] uint32x2_t test_vceqz_s32(int32x2_t a) { return vceqz_s32(a); } -// CHECK-LABEL: test_vceqz_s64 -// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}} +// CHECK-LABEL: define <1 x i64> @test_vceqz_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCEQZ_I]] uint64x1_t test_vceqz_s64(int64x1_t a) { return vceqz_s64(a); } -// CHECK-LABEL: test_vceqz_u64 -// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}} +// CHECK-LABEL: define <1 x i64> @test_vceqz_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +//
CHECK: [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCEQZ_I]] uint64x1_t test_vceqz_u64(uint64x1_t a) { return vceqz_u64(a); } -// CHECK-LABEL: test_vceqz_p64 -// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}} +// CHECK-LABEL: define <1 x i64> @test_vceqz_p64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCEQZ_I]] uint64x1_t test_vceqz_p64(poly64x1_t a) { return vceqz_p64(a); } -// CHECK-LABEL: test_vceqzq_s8 -// CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}} +// CHECK-LABEL: define <16 x i8> @test_vceqzq_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK: ret <16 x i8> [[VCEQZ_I]] uint8x16_t test_vceqzq_s8(int8x16_t a) { return vceqzq_s8(a); } -// CHECK-LABEL: test_vceqzq_s16 -// CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}} +// CHECK-LABEL: define <8 x i16> @test_vceqzq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK: ret <8 x i16> [[VCEQZ_I]] uint16x8_t test_vceqzq_s16(int16x8_t a) { return vceqzq_s16(a); } -// CHECK-LABEL: test_vceqzq_s32 -// CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}} +// CHECK-LABEL: define <4 x i32> @test_vceqzq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCEQZ_I]] uint32x4_t test_vceqzq_s32(int32x4_t a) { return vceqzq_s32(a); } -// CHECK-LABEL: test_vceqzq_s64 -// CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}} +// CHECK-LABEL: define <2 x i64> @test_vceqzq_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCEQZ_I]] uint64x2_t test_vceqzq_s64(int64x2_t a) { return vceqzq_s64(a); } -// CHECK-LABEL: test_vceqz_u8 -// CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}} +// CHECK-LABEL: define <8 x i8> @test_vceqz_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: ret <8 x i8> [[VCEQZ_I]] uint8x8_t test_vceqz_u8(uint8x8_t a) { return vceqz_u8(a); } -// CHECK-LABEL: test_vceqz_u16 -// CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}} +// CHECK-LABEL: define <4 x i16> @test_vceqz_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VCEQZ_I]] 
uint16x4_t test_vceqz_u16(uint16x4_t a) { return vceqz_u16(a); } -// CHECK-LABEL: test_vceqz_u32 -// CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}} +// CHECK-LABEL: define <2 x i32> @test_vceqz_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCEQZ_I]] uint32x2_t test_vceqz_u32(uint32x2_t a) { return vceqz_u32(a); } -// CHECK-LABEL: test_vceqzq_u8 -// CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}} +// CHECK-LABEL: define <16 x i8> @test_vceqzq_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK: ret <16 x i8> [[VCEQZ_I]] uint8x16_t test_vceqzq_u8(uint8x16_t a) { return vceqzq_u8(a); } -// CHECK-LABEL: test_vceqzq_u16 -// CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}} +// CHECK-LABEL: define <8 x i16> @test_vceqzq_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK: ret <8 x i16> [[VCEQZ_I]] uint16x8_t test_vceqzq_u16(uint16x8_t a) { return vceqzq_u16(a); } -// CHECK-LABEL: test_vceqzq_u32 -// CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}} +// CHECK-LABEL: define <4 x i32> @test_vceqzq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCEQZ_I]] uint32x4_t test_vceqzq_u32(uint32x4_t a) { return vceqzq_u32(a); } -// CHECK-LABEL: test_vceqzq_u64 -// CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}} +// CHECK-LABEL: define <2 x i64> @test_vceqzq_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCEQZ_I]] uint64x2_t test_vceqzq_u64(uint64x2_t a) { return vceqzq_u64(a); } -// CHECK-LABEL: test_vceqz_f32 -// CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0 +// CHECK-LABEL: define <2 x i32> @test_vceqz_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fcmp oeq <2 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCEQZ_I]] uint32x2_t test_vceqz_f32(float32x2_t a) { return vceqz_f32(a); } -// CHECK-LABEL: test_vceqz_f64 -// CHECK: fcmeq {{d[0-9]+}}, {{d[0-9]+}}, #0 +// CHECK-LABEL: define <1 x i64> @test_vceqz_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP2:%.*]] = fcmp oeq <1 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCEQZ_I]] uint64x1_t 
test_vceqz_f64(float64x1_t a) { return vceqz_f64(a); } -// CHECK-LABEL: test_vceqzq_f32 -// CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0 +// CHECK-LABEL: define <4 x i32> @test_vceqzq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fcmp oeq <4 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCEQZ_I]] uint32x4_t test_vceqzq_f32(float32x4_t a) { return vceqzq_f32(a); } -// CHECK-LABEL: test_vceqz_p8 -// CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}} +// CHECK-LABEL: define <8 x i8> @test_vceqz_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: ret <8 x i8> [[VCEQZ_I]] uint8x8_t test_vceqz_p8(poly8x8_t a) { return vceqz_p8(a); } -// CHECK-LABEL: test_vceqzq_p8 -// CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}} +// CHECK-LABEL: define <16 x i8> @test_vceqzq_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK: ret <16 x i8> [[VCEQZ_I]] uint8x16_t test_vceqzq_p8(poly8x16_t a) { return vceqzq_p8(a); } -// CHECK-LABEL: test_vceqz_p16 -// CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}} +// CHECK-LABEL: define <4 x i16> @test_vceqz_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VCEQZ_I]] uint16x4_t test_vceqz_p16(poly16x4_t a) { return vceqz_p16(a); } -// CHECK-LABEL: test_vceqzq_p16 -// CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}} +// CHECK-LABEL: define <8 x i16> @test_vceqzq_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK: ret <8 x i16> [[VCEQZ_I]] uint16x8_t test_vceqzq_p16(poly16x8_t a) { return vceqzq_p16(a); } -// CHECK-LABEL: test_vceqzq_f64 -// CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 +// CHECK-LABEL: define <2 x i64> @test_vceqzq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP2:%.*]] = fcmp oeq <2 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCEQZ_I]] uint64x2_t test_vceqzq_f64(float64x2_t a) { return vceqzq_f64(a); } -// CHECK-LABEL: test_vceqzq_p64 -// CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 +// CHECK-LABEL: define <2 x i64> @test_vceqzq_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCEQZ_I]] uint64x2_t test_vceqzq_p64(poly64x2_t a) { return vceqzq_p64(a); } -// CHECK-LABEL: test_vcgez_s8 -// CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}} +// 
CHECK-LABEL: define <8 x i8> @test_vcgez_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sge <8 x i8> %a, zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: ret <8 x i8> [[VCGEZ_I]] uint8x8_t test_vcgez_s8(int8x8_t a) { return vcgez_s8(a); } -// CHECK-LABEL: test_vcgez_s16 -// CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}} +// CHECK-LABEL: define <4 x i16> @test_vcgez_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = icmp sge <4 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VCGEZ_I]] uint16x4_t test_vcgez_s16(int16x4_t a) { return vcgez_s16(a); } -// CHECK-LABEL: test_vcgez_s32 -// CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}} +// CHECK-LABEL: define <2 x i32> @test_vcgez_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = icmp sge <2 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCGEZ_I]] uint32x2_t test_vcgez_s32(int32x2_t a) { return vcgez_s32(a); } -// CHECK-LABEL: test_vcgez_s64 -// CHECK: cmge {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}} +// CHECK-LABEL: define <1 x i64> @test_vcgez_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP2:%.*]] = icmp sge <1 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCGEZ_I]] uint64x1_t test_vcgez_s64(int64x1_t a) { return vcgez_s64(a); } -// CHECK-LABEL: test_vcgezq_s8 -// CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}} +// CHECK-LABEL: define <16 x i8> @test_vcgezq_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sge <16 x i8> %a, zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK: ret <16 x i8> [[VCGEZ_I]] uint8x16_t test_vcgezq_s8(int8x16_t a) { return vcgezq_s8(a); } -// CHECK-LABEL: test_vcgezq_s16 -// CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}} +// CHECK-LABEL: define <8 x i16> @test_vcgezq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = icmp sge <8 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK: ret <8 x i16> [[VCGEZ_I]] uint16x8_t test_vcgezq_s16(int16x8_t a) { return vcgezq_s16(a); } -// CHECK-LABEL: test_vcgezq_s32 -// CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}} +// CHECK-LABEL: define <4 x i32> @test_vcgezq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = icmp sge <4 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCGEZ_I]] uint32x4_t test_vcgezq_s32(int32x4_t a) { return vcgezq_s32(a); } -// CHECK-LABEL: test_vcgezq_s64 -// CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}} +// CHECK-LABEL: define <2 x i64> @test_vcgezq_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x 
i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = icmp sge <2 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCGEZ_I]] uint64x2_t test_vcgezq_s64(int64x2_t a) { return vcgezq_s64(a); } -// CHECK-LABEL: test_vcgez_f32 -// CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0 +// CHECK-LABEL: define <2 x i32> @test_vcgez_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fcmp oge <2 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCGEZ_I]] uint32x2_t test_vcgez_f32(float32x2_t a) { return vcgez_f32(a); } -// CHECK-LABEL: test_vcgez_f64 -// CHECK: fcmge {{d[0-9]+}}, {{d[0-9]+}}, #0 +// CHECK-LABEL: define <1 x i64> @test_vcgez_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP2:%.*]] = fcmp oge <1 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCGEZ_I]] uint64x1_t test_vcgez_f64(float64x1_t a) { return vcgez_f64(a); } -// CHECK-LABEL: test_vcgezq_f32 -// CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0 +// CHECK-LABEL: define <4 x i32> @test_vcgezq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fcmp oge <4 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCGEZ_I]] uint32x4_t test_vcgezq_f32(float32x4_t a) { return vcgezq_f32(a); } -// CHECK-LABEL: test_vcgezq_f64 -// CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 +// CHECK-LABEL: define <2 x i64> @test_vcgezq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP2:%.*]] = fcmp oge <2 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCGEZ_I]] uint64x2_t test_vcgezq_f64(float64x2_t a) { return vcgezq_f64(a); } -// CHECK-LABEL: test_vclez_s8 -// CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}} +// CHECK-LABEL: define <8 x i8> @test_vclez_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sle <8 x i8> %a, zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: ret <8 x i8> [[VCLEZ_I]] uint8x8_t test_vclez_s8(int8x8_t a) { return vclez_s8(a); } -// CHECK-LABEL: test_vclez_s16 -// CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}} +// CHECK-LABEL: define <4 x i16> @test_vclez_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = icmp sle <4 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VCLEZ_I]] uint16x4_t test_vclez_s16(int16x4_t a) { return vclez_s16(a); } -// CHECK-LABEL: test_vclez_s32 -// CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}} +// CHECK-LABEL: define <2 x i32> @test_vclez_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x 
i32> +// CHECK: [[TMP2:%.*]] = icmp sle <2 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCLEZ_I]] uint32x2_t test_vclez_s32(int32x2_t a) { return vclez_s32(a); } -// CHECK-LABEL: test_vclez_s64 -// CHECK: cmle {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}} +// CHECK-LABEL: define <1 x i64> @test_vclez_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP2:%.*]] = icmp sle <1 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCLEZ_I]] uint64x1_t test_vclez_s64(int64x1_t a) { return vclez_s64(a); } -// CHECK-LABEL: test_vclezq_s8 -// CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}} +// CHECK-LABEL: define <16 x i8> @test_vclezq_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sle <16 x i8> %a, zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK: ret <16 x i8> [[VCLEZ_I]] uint8x16_t test_vclezq_s8(int8x16_t a) { return vclezq_s8(a); } -// CHECK-LABEL: test_vclezq_s16 -// CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}} +// CHECK-LABEL: define <8 x i16> @test_vclezq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = icmp sle <8 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK: ret <8 x i16> [[VCLEZ_I]] uint16x8_t test_vclezq_s16(int16x8_t a) { return vclezq_s16(a); } -// CHECK-LABEL: test_vclezq_s32 -// CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}} +// CHECK-LABEL: define <4 x i32> @test_vclezq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = icmp sle <4 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCLEZ_I]] uint32x4_t test_vclezq_s32(int32x4_t a) { return vclezq_s32(a); } -// CHECK-LABEL: test_vclezq_s64 -// CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}} +// CHECK-LABEL: define <2 x i64> @test_vclezq_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = icmp sle <2 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCLEZ_I]] uint64x2_t test_vclezq_s64(int64x2_t a) { return vclezq_s64(a); } -// CHECK-LABEL: test_vclez_f32 -// CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0 +// CHECK-LABEL: define <2 x i32> @test_vclez_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fcmp ole <2 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCLEZ_I]] uint32x2_t test_vclez_f32(float32x2_t a) { return vclez_f32(a); } -// CHECK-LABEL: test_vclez_f64 -// CHECK: fcmle {{d[0-9]+}}, {{d[0-9]+}}, #0 +// CHECK-LABEL: define <1 x i64> @test_vclez_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: 
[[TMP2:%.*]] = fcmp ole <1 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCLEZ_I]] uint64x1_t test_vclez_f64(float64x1_t a) { return vclez_f64(a); } -// CHECK-LABEL: test_vclezq_f32 -// CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0 +// CHECK-LABEL: define <4 x i32> @test_vclezq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fcmp ole <4 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCLEZ_I]] uint32x4_t test_vclezq_f32(float32x4_t a) { return vclezq_f32(a); } -// CHECK-LABEL: test_vclezq_f64 -// CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 +// CHECK-LABEL: define <2 x i64> @test_vclezq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP2:%.*]] = fcmp ole <2 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCLEZ_I]] uint64x2_t test_vclezq_f64(float64x2_t a) { return vclezq_f64(a); } -// CHECK-LABEL: test_vcgtz_s8 -// CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}} +// CHECK-LABEL: define <8 x i8> @test_vcgtz_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sgt <8 x i8> %a, zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: ret <8 x i8> [[VCGTZ_I]] uint8x8_t test_vcgtz_s8(int8x8_t a) { return vcgtz_s8(a); } -// CHECK-LABEL: test_vcgtz_s16 -// CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}} +// CHECK-LABEL: define <4 x i16> @test_vcgtz_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = icmp sgt <4 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VCGTZ_I]] uint16x4_t test_vcgtz_s16(int16x4_t a) { return vcgtz_s16(a); } -// CHECK-LABEL: test_vcgtz_s32 -// CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}} +// CHECK-LABEL: define <2 x i32> @test_vcgtz_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = icmp sgt <2 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCGTZ_I]] uint32x2_t test_vcgtz_s32(int32x2_t a) { return vcgtz_s32(a); } -// CHECK-LABEL: test_vcgtz_s64 -// CHECK: cmgt {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}} +// CHECK-LABEL: define <1 x i64> @test_vcgtz_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP2:%.*]] = icmp sgt <1 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCGTZ_I]] uint64x1_t test_vcgtz_s64(int64x1_t a) { return vcgtz_s64(a); } -// CHECK-LABEL: test_vcgtzq_s8 -// CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}} +// CHECK-LABEL: define <16 x i8> @test_vcgtzq_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp sgt <16 x i8> %a, zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK: ret <16 x i8> 
[[VCGTZ_I]] uint8x16_t test_vcgtzq_s8(int8x16_t a) { return vcgtzq_s8(a); } -// CHECK-LABEL: test_vcgtzq_s16 -// CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}} +// CHECK-LABEL: define <8 x i16> @test_vcgtzq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = icmp sgt <8 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK: ret <8 x i16> [[VCGTZ_I]] uint16x8_t test_vcgtzq_s16(int16x8_t a) { return vcgtzq_s16(a); } -// CHECK-LABEL: test_vcgtzq_s32 -// CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}} +// CHECK-LABEL: define <4 x i32> @test_vcgtzq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCGTZ_I]] uint32x4_t test_vcgtzq_s32(int32x4_t a) { return vcgtzq_s32(a); } -// CHECK-LABEL: test_vcgtzq_s64 -// CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}} +// CHECK-LABEL: define <2 x i64> @test_vcgtzq_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = icmp sgt <2 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCGTZ_I]] uint64x2_t test_vcgtzq_s64(int64x2_t a) { return vcgtzq_s64(a); } -// CHECK-LABEL: test_vcgtz_f32 -// CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0 +// CHECK-LABEL: define <2 x i32> @test_vcgtz_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fcmp ogt <2 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCGTZ_I]] uint32x2_t test_vcgtz_f32(float32x2_t a) { return vcgtz_f32(a); } -// CHECK-LABEL: test_vcgtz_f64 -// CHECK: fcmgt {{d[0-9]+}}, {{d[0-9]+}}, #0 +// CHECK-LABEL: define <1 x i64> @test_vcgtz_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP2:%.*]] = fcmp ogt <1 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCGTZ_I]] uint64x1_t test_vcgtz_f64(float64x1_t a) { return vcgtz_f64(a); } -// CHECK-LABEL: test_vcgtzq_f32 -// CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0 +// CHECK-LABEL: define <4 x i32> @test_vcgtzq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fcmp ogt <4 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCGTZ_I]] uint32x4_t test_vcgtzq_f32(float32x4_t a) { return vcgtzq_f32(a); } -// CHECK-LABEL: test_vcgtzq_f64 -// CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 +// CHECK-LABEL: define <2 x i64> @test_vcgtzq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP2:%.*]] = fcmp 
ogt <2 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCGTZ_I]] uint64x2_t test_vcgtzq_f64(float64x2_t a) { return vcgtzq_f64(a); } -// CHECK-LABEL: test_vcltz_s8 -// CHECK: sshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #7 +// CHECK-LABEL: define <8 x i8> @test_vcltz_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp slt <8 x i8> %a, zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: ret <8 x i8> [[VCLTZ_I]] uint8x8_t test_vcltz_s8(int8x8_t a) { return vcltz_s8(a); } -// CHECK-LABEL: test_vcltz_s16 -// CHECK: sshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15 +// CHECK-LABEL: define <4 x i16> @test_vcltz_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = icmp slt <4 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VCLTZ_I]] uint16x4_t test_vcltz_s16(int16x4_t a) { return vcltz_s16(a); } -// CHECK-LABEL: test_vcltz_s32 -// CHECK: sshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 +// CHECK-LABEL: define <2 x i32> @test_vcltz_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCLTZ_I]] uint32x2_t test_vcltz_s32(int32x2_t a) { return vcltz_s32(a); } -// CHECK-LABEL: test_vcltz_s64 -// CHECK: sshr {{d[0-9]+}}, {{d[0-9]+}}, #63 +// CHECK-LABEL: define <1 x i64> @test_vcltz_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP2:%.*]] = icmp slt <1 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCLTZ_I]] uint64x1_t test_vcltz_s64(int64x1_t a) { return vcltz_s64(a); } -// CHECK-LABEL: test_vcltzq_s8 -// CHECK: sshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #7 +// CHECK-LABEL: define <16 x i8> @test_vcltzq_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = icmp slt <16 x i8> %a, zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK: ret <16 x i8> [[VCLTZ_I]] uint8x16_t test_vcltzq_s8(int8x16_t a) { return vcltzq_s8(a); } -// CHECK-LABEL: test_vcltzq_s16 -// CHECK: sshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15 +// CHECK-LABEL: define <8 x i16> @test_vcltzq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = icmp slt <8 x i16> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK: ret <8 x i16> [[VCLTZ_I]] uint16x8_t test_vcltzq_s16(int16x8_t a) { return vcltzq_s16(a); } -// CHECK-LABEL: test_vcltzq_s32 -// CHECK: sshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 +// CHECK-LABEL: define <4 x i32> @test_vcltzq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCLTZ_I]] uint32x4_t test_vcltzq_s32(int32x4_t a) { return vcltzq_s32(a); } -// 
CHECK-LABEL: test_vcltzq_s64 -// CHECK: sshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #63 +// CHECK-LABEL: define <2 x i64> @test_vcltzq_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = icmp slt <2 x i64> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCLTZ_I]] uint64x2_t test_vcltzq_s64(int64x2_t a) { return vcltzq_s64(a); } -// CHECK-LABEL: test_vcltz_f32 -// CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0 +// CHECK-LABEL: define <2 x i32> @test_vcltz_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fcmp olt <2 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VCLTZ_I]] uint32x2_t test_vcltz_f32(float32x2_t a) { return vcltz_f32(a); } -// CHECK-LABEL: test_vcltz_f64 -// CHECK: fcmlt {{d[0-9]+}}, {{d[0-9]+}}, #0 +// CHECK-LABEL: define <1 x i64> @test_vcltz_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP2:%.*]] = fcmp olt <1 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK: ret <1 x i64> [[VCLTZ_I]] uint64x1_t test_vcltz_f64(float64x1_t a) { return vcltz_f64(a); } -// CHECK-LABEL: test_vcltzq_f32 -// CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0 +// CHECK-LABEL: define <4 x i32> @test_vcltzq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fcmp olt <4 x float> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK: ret <4 x i32> [[VCLTZ_I]] uint32x4_t test_vcltzq_f32(float32x4_t a) { return vcltzq_f32(a); } -// CHECK-LABEL: test_vcltzq_f64 -// CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 +// CHECK-LABEL: define <2 x i64> @test_vcltzq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP2:%.*]] = fcmp olt <2 x double> [[TMP1]], zeroinitializer +// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK: ret <2 x i64> [[VCLTZ_I]] uint64x2_t test_vcltzq_f64(float64x2_t a) { return vcltzq_f64(a); } -// CHECK-LABEL: test_vrev16_s8 -// CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vrev16_s8(int8x8_t a) { return vrev16_s8(a); } -// CHECK-LABEL: test_vrev16_u8 -// CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vrev16_u8(uint8x8_t a) { return vrev16_u8(a); } -// CHECK-LABEL: test_vrev16_p8 -// CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev16_p8(poly8x8_t a) { return vrev16_p8(a); } -// CHECK-LABEL: test_vrev16q_s8 -// CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vrev16q_s8(int8x16_t a) { return vrev16q_s8(a); } -// CHECK-LABEL: test_vrev16q_u8 -// CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vrev16q_u8(uint8x16_t a) { return vrev16q_u8(a); } -// CHECK-LABEL: test_vrev16q_p8 -// CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vrev16q_p8(poly8x16_t a) { return vrev16q_p8(a); } -// CHECK-LABEL: test_vrev32_s8 -// CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vrev32_s8(int8x8_t a) { return vrev32_s8(a); } -// CHECK-LABEL: test_vrev32_s16 -// CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vrev32_s16(int16x4_t a) { return vrev32_s16(a); } -// CHECK-LABEL: test_vrev32_u8 -// CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vrev32_u8(uint8x8_t a) { return vrev32_u8(a); } -// CHECK-LABEL: test_vrev32_u16 -// CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-LABEL: define <4 x i16> @test_vrev32_u16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vrev32_u16(uint16x4_t a) { return vrev32_u16(a); } -// CHECK-LABEL: test_vrev32_p8 -// CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vrev32_p8(poly8x8_t a) { return vrev32_p8(a); } -// CHECK-LABEL: test_vrev32_p16 -// CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vrev32_p16(poly16x4_t a) { return vrev32_p16(a); } -// CHECK-LABEL: test_vrev32q_s8 -// CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vrev32q_s8(int8x16_t a) { return vrev32q_s8(a); } -// CHECK-LABEL: test_vrev32q_s16 -// CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-LABEL: define <8 x i16>
@test_vrev32q_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vrev32q_s16(int16x8_t a) { return vrev32q_s16(a); } -// CHECK-LABEL: test_vrev32q_u8 -// CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vrev32q_u8(uint8x16_t a) { return vrev32q_u8(a); } -// CHECK-LABEL: test_vrev32q_u16 -// CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vrev32q_u16(uint16x8_t a) { return vrev32q_u16(a); } -// CHECK-LABEL: test_vrev32q_p8 -// CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vrev32q_p8(poly8x16_t a) { return vrev32q_p8(a); } -// CHECK-LABEL: test_vrev32q_p16 -// CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vrev32q_p16(poly16x8_t a) { return vrev32q_p16(a); } -// CHECK-LABEL: test_vrev64_s8 -// CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vrev64_s8(int8x8_t a) { return vrev64_s8(a); } -// CHECK-LABEL: test_vrev64_s16 -// CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vrev64_s16(int16x4_t a) { return vrev64_s16(a); } -// CHECK-LABEL: test_vrev64_s32 -// CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s +// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vrev64_s32(int32x2_t a) { return vrev64_s32(a); } -// CHECK-LABEL: test_vrev64_u8 -// CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vrev64_u8(uint8x8_t a) { return vrev64_u8(a); } -// CHECK-LABEL: test_vrev64_u16 -// CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vrev64_u16(uint16x4_t a) { return vrev64_u16(a); } -// CHECK-LABEL: test_vrev64_u32 -// CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s +// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0> +// CHECK: ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vrev64_u32(uint32x2_t a) { return vrev64_u32(a); } -// CHECK-LABEL: test_vrev64_p8 -// CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vrev64_p8(poly8x8_t a) { return vrev64_p8(a); } -// CHECK-LABEL: test_vrev64_p16 -// CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vrev64_p16(poly16x4_t a) { return vrev64_p16(a); } -// CHECK-LABEL: test_vrev64_f32 -// CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s +// CHECK-LABEL: define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vrev64_f32(float32x2_t a) { return vrev64_f32(a); } -// CHECK-LABEL: test_vrev64q_s8 -// CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vrev64q_s8(int8x16_t a) { return vrev64q_s8(a); } -// CHECK-LABEL: test_vrev64q_s16 -// CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vrev64q_s16(int16x8_t a) { return vrev64q_s16(a); } -// CHECK-LABEL: test_vrev64q_s32 -// CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s +// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vrev64q_s32(int32x4_t a) { return vrev64q_s32(a); } -// CHECK-LABEL: test_vrev64q_u8 -// CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev64q_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vrev64q_u8(uint8x16_t a) { return vrev64q_u8(a); } -// CHECK-LABEL: test_vrev64q_u16 -// CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vrev64q_u16(uint16x8_t a) { return vrev64q_u16(a); } -// CHECK-LABEL: test_vrev64q_u32 -// CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s +// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vrev64q_u32(uint32x4_t a) { return vrev64q_u32(a); } -// CHECK-LABEL: test_vrev64q_p8 -// CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vrev64q_p8(poly8x16_t a) { return vrev64q_p8(a); } -// CHECK-LABEL: test_vrev64q_p16 -// CHECK: rev64 v{{[0-9]+}}.8h,
v{{[0-9]+}}.8h +// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vrev64q_p16(poly16x8_t a) { return vrev64q_p16(a); } -// CHECK-LABEL: test_vrev64q_f32 -// CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s +// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vrev64q_f32(float32x4_t a) { return vrev64q_f32(a); } +// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 { +// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %a) #2 +// CHECK: ret <4 x i16> [[VPADDL_I]] int16x4_t test_vpaddl_s8(int8x8_t a) { - // CHECK-LABEL: test_vpaddl_s8 return vpaddl_s8(a); - // CHECK: saddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #2 +// CHECK: ret <2 x i32> [[VPADDL1_I]] int32x2_t test_vpaddl_s16(int16x4_t a) { - // CHECK-LABEL: test_vpaddl_s16 return vpaddl_s16(a); - // CHECK: saddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h } +// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #2 +// CHECK: ret <1 x i64> [[VPADDL1_I]] int64x1_t test_vpaddl_s32(int32x2_t a) { - // CHECK-LABEL: test_vpaddl_s32 return vpaddl_s32(a); - // CHECK: saddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 { +// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %a) #2 +// CHECK: ret <4 x i16> [[VPADDL_I]] uint16x4_t test_vpaddl_u8(uint8x8_t a) { - // CHECK-LABEL: test_vpaddl_u8 return vpaddl_u8(a); - // CHECK: uaddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #2 +// CHECK: ret <2 x i32> [[VPADDL1_I]] uint32x2_t test_vpaddl_u16(uint16x4_t a) { - // CHECK-LABEL: test_vpaddl_u16 return vpaddl_u16(a); - // CHECK: uaddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h } +// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #2 +// CHECK: ret <1 x i64> [[VPADDL1_I]] uint64x1_t test_vpaddl_u32(uint32x2_t a) { - // CHECK-LABEL: test_vpaddl_u32 return vpaddl_u32(a); - // CHECK: uaddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 { +// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %a) #2 +// CHECK: ret <8 x i16> [[VPADDL_I]] int16x8_t
+// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %a) #2
+// CHECK: ret <8 x i16> [[VPADDL_I]]
 int16x8_t test_vpaddlq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vpaddlq_s8
   return vpaddlq_s8(a);
-  // CHECK: saddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #2
+// CHECK: ret <4 x i32> [[VPADDL1_I]]
 int32x4_t test_vpaddlq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vpaddlq_s16
   return vpaddlq_s16(a);
-  // CHECK: saddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #2
+// CHECK: ret <2 x i64> [[VPADDL1_I]]
 int64x2_t test_vpaddlq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vpaddlq_s32
   return vpaddlq_s32(a);
-  // CHECK: saddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 {
+// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %a) #2
+// CHECK: ret <8 x i16> [[VPADDL_I]]
 uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vpaddlq_u8
   return vpaddlq_u8(a);
-  // CHECK: uaddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #2
+// CHECK: ret <4 x i32> [[VPADDL1_I]]
 uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vpaddlq_u16
   return vpaddlq_u16(a);
-  // CHECK: uaddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #2
+// CHECK: ret <2 x i64> [[VPADDL1_I]]
 uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vpaddlq_u32
   return vpaddlq_u32(a);
-  // CHECK: uaddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %b) #2
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK: ret <4 x i16> [[TMP2]]
 int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vpadal_s8
   return vpadal_s8(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
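vpadal ("pairwise add and accumulate long") used to be checked as a single sadalp/uadalp instruction; the CHECK lines above show it is now emitted as the widening pairwise add of b followed by an ordinary vector add into a. A reference sketch under that reading (helper name illustrative; the final add wraps exactly like the plain IR add):

#include <stdint.h>

/* vpadal_s8 reference: r = a + saddlp(b), with the accumulate wrapping
   modulo 2^16, matching the `add` in the IR. */
static void padal_s8x8(const int16_t a[4], const int8_t b[8], int16_t r[4]) {
  for (int i = 0; i < 4; ++i)
    r[i] = (int16_t)(a[i] + (int16_t)b[2 * i] + (int16_t)b[2 * i + 1]);
}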
+// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <2 x i32> [[TMP3]]
 int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vpadal_s16
   return vpadal_s16(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <1 x i64> [[TMP3]]
 int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vpadal_s32
   return vpadal_s32(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %b) #2
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK: ret <4 x i16> [[TMP2]]
 uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vpadal_u8
   return vpadal_u8(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <2 x i32> [[TMP3]]
 uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vpadal_u16
   return vpadal_u16(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <1 x i64> [[TMP3]]
 uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vpadal_u32
   return vpadal_u32(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %b) #2
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK: ret <8 x i16> [[TMP2]]
 int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vpadalq_s8
   return vpadalq_s8(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <4 x i32> [[TMP3]]
 int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vpadalq_s16
   return vpadalq_s16(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <2 x i64> [[TMP3]]
 int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vpadalq_s32
   return vpadalq_s32(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %b) #2
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK: ret <8 x i16> [[TMP2]]
 uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vpadalq_u8
   return vpadalq_u8(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <4 x i32> [[TMP3]]
 uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vpadalq_u16
   return vpadalq_u16(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #2
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK: ret <2 x i64> [[TMP3]]
 uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vpadalq_u32
   return vpadalq_u32(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
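sqabs, tested next, is a saturating absolute value: the only input where it differs from a plain abs is the most negative value, which has no positive counterpart in the same type. A scalar sketch with an illustrative helper name:

#include <stdint.h>

/* vqabs_s8 reference: |x| with saturation, so INT8_MIN (-128) yields
   INT8_MAX (127) instead of wrapping back to -128. */
static int8_t qabs_s8(int8_t x) {
  if (x == INT8_MIN) return INT8_MAX;
  return (int8_t)(x < 0 ? -x : x);
}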
+// CHECK-LABEL: define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 {
+// CHECK: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VQABS_V_I]]
 int8x8_t test_vqabs_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqabs_s8
   return vqabs_s8(a);
-  // CHECK: sqabs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VQABSQ_V_I]]
 int8x16_t test_vqabsq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqabsq_s8
   return vqabsq_s8(a);
-  // CHECK: sqabs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[VQABS_V_I]]) #2
+// CHECK: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vqabs_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqabs_s16
   return vqabs_s16(a);
-  // CHECK: sqabs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> [[VQABSQ_V_I]]) #2
+// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP1]]
 int16x8_t test_vqabsq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqabsq_s16
   return vqabsq_s16(a);
-  // CHECK: sqabs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> [[VQABS_V_I]]) #2
+// CHECK: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vqabs_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqabs_s32
   return vqabs_s32(a);
-  // CHECK: sqabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> [[VQABSQ_V_I]]) #2
+// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
 int32x4_t test_vqabsq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqabsq_s32
   return vqabsq_s32(a);
-  // CHECK: sqabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i64> @test_vqabsq_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQABSQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqabs.v2i64(<2 x i64> [[VQABSQ_V_I]]) #2
+// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <2 x i64> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP1]]
 int64x2_t test_vqabsq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqabsq_s64
   return vqabsq_s64(a);
-  // CHECK: sqabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
+// CHECK: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VQNEG_V_I]]
 int8x8_t test_vqneg_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqneg_s8
   return vqneg_s8(a);
-  // CHECK: sqneg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VQNEGQ_V_I]]
 int8x16_t test_vqnegq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqnegq_s8
   return vqnegq_s8(a);
-  // CHECK: sqneg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[VQNEG_V_I]]) #2
+// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vqneg_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqneg_s16
   return vqneg_s16(a);
-  // CHECK: sqneg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VQNEGQ_V_I]]) #2
+// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP1]]
 int16x8_t test_vqnegq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqnegq_s16
   return vqnegq_s16(a);
-  // CHECK: sqneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> [[VQNEG_V_I]]) #2
+// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vqneg_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqneg_s32
   return vqneg_s32(a);
-  // CHECK: sqneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> [[VQNEGQ_V_I]]) #2
+// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
 int32x4_t test_vqnegq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqnegq_s32
   return vqnegq_s32(a);
-  // CHECK: sqneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
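sqneg saturates in the same single corner case: negating the most negative value. A sketch with an illustrative helper name:

#include <stdint.h>

/* vqneg_s8 reference: -x with saturation; -INT8_MIN is not
   representable, so it clamps to INT8_MAX. */
static int8_t qneg_s8(int8_t x) {
  return x == INT8_MIN ? INT8_MAX : (int8_t)-x;
}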
+// CHECK-LABEL: define <2 x i64> @test_vqnegq_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQNEGQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqneg.v2i64(<2 x i64> [[VQNEGQ_V_I]]) #2
+// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <2 x i64> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP1]]
 int64x2_t test_vqnegq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqnegq_s64
   return vqnegq_s64(a);
-  // CHECK: sqneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
+// CHECK: ret <8 x i8> [[SUB_I]]
 int8x8_t test_vneg_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vneg_s8
   return vneg_s8(a);
-  // CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
+// CHECK: ret <16 x i8> [[SUB_I]]
 int8x16_t test_vnegq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vnegq_s8
   return vnegq_s8(a);
-  // CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
+// CHECK: ret <4 x i16> [[SUB_I]]
 int16x4_t test_vneg_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vneg_s16
   return vneg_s16(a);
-  // CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
+// CHECK: ret <8 x i16> [[SUB_I]]
 int16x8_t test_vnegq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vnegq_s16
   return vnegq_s16(a);
-  // CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
+// CHECK: ret <2 x i32> [[SUB_I]]
 int32x2_t test_vneg_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vneg_s32
   return vneg_s32(a);
-  // CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
+// CHECK: ret <4 x i32> [[SUB_I]]
 int32x4_t test_vnegq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vnegq_s32
   return vnegq_s32(a);
-  // CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i64> @test_vnegq_s64(<2 x i64> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> zeroinitializer, %a
+// CHECK: ret <2 x i64> [[SUB_I]]
 int64x2_t test_vnegq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vnegq_s64
   return vnegq_s64(a);
-  // CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <2 x float> @test_vneg_f32(<2 x float> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = fsub <2 x float> , %a
+// CHECK: ret <2 x float> [[SUB_I]]
 float32x2_t test_vneg_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vneg_f32
   return vneg_f32(a);
-  // CHECK: fneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = fsub <4 x float> , %a
+// CHECK: ret <4 x float> [[SUB_I]]
 float32x4_t test_vnegq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vnegq_f32
   return vnegq_f32(a);
-  // CHECK: fneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x double> @test_vnegq_f64(<2 x double> %a) #0 {
+// CHECK: [[SUB_I:%.*]] = fsub <2 x double> , %a
+// CHECK: ret <2 x double> [[SUB_I]]
 float64x2_t test_vnegq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vnegq_f64
   return vnegq_f64(a);
-  // CHECK: fneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
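Plain vneg needs no target intrinsic at all: the integer variants lower to `sub zeroinitializer, %a`, and the float variants to an fsub whose (elided) first operand is presumably the usual -0.0 splat LLVM uses to express fneg. Subtracting from -0.0 rather than +0.0 matters only for signed zero, as this small sketch illustrates:

/* Why fneg is modeled as (-0.0) - x rather than 0.0 - x: negation must
   flip the sign of zero, and 0.0f - 0.0f would give +0.0f, not -0.0f. */
static float neg_f32(float x) { return -0.0f - x; }  /* behaves as -x */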
+// CHECK-LABEL: define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 {
+// CHECK: [[VABS_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VABS_I]]
 int8x8_t test_vabs_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vabs_s8
   return vabs_s8(a);
-  // CHECK: abs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VABS_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VABS_I]]
 int8x16_t test_vabsq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vabsq_s8
   return vabsq_s8(a);
-  // CHECK: abs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> [[VABS_I]]) #2
+// CHECK: ret <4 x i16> [[VABS1_I]]
 int16x4_t test_vabs_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vabs_s16
   return vabs_s16(a);
-  // CHECK: abs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> [[VABS_I]]) #2
+// CHECK: ret <8 x i16> [[VABS1_I]]
 int16x8_t test_vabsq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vabsq_s16
   return vabsq_s16(a);
-  // CHECK: abs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> [[VABS_I]]) #2
+// CHECK: ret <2 x i32> [[VABS1_I]]
 int32x2_t test_vabs_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vabs_s32
   return vabs_s32(a);
-  // CHECK: abs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> [[VABS_I]]) #2
+// CHECK: ret <4 x i32> [[VABS1_I]]
 int32x4_t test_vabsq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vabsq_s32
   return vabsq_s32(a);
-  // CHECK: abs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i64> @test_vabsq_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VABS1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.abs.v2i64(<2 x i64> [[VABS_I]]) #2
+// CHECK: ret <2 x i64> [[VABS1_I]]
 int64x2_t test_vabsq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vabsq_s64
   return vabsq_s64(a);
-  // CHECK: abs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <2 x float> @test_vabs_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[VABS_I]]) #2
+// CHECK: ret <2 x float> [[VABS1_I]]
 float32x2_t test_vabs_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vabs_f32
   return vabs_f32(a);
-  // CHECK: fabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
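vabs, by contrast with vqabs above, does not saturate: the integer forms map to the target abs intrinsic (wrapping at the most negative value) and the float forms to the generic llvm.fabs. A sketch with an illustrative helper name; the wrap on INT8_MIN is the conventional two's-complement behavior:

#include <stdint.h>

/* vabs_s8 reference: plain two's-complement |x|, so abs(INT8_MIN)
   stays INT8_MIN, unlike the saturating sqabs. */
static int8_t abs_s8(int8_t x) {
  return (int8_t)(x < 0 ? -x : x);  /* (int8_t)128 wraps to -128 */
}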
+// CHECK-LABEL: define <4 x float> @test_vabsq_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VABS_I]]) #2
+// CHECK: ret <4 x float> [[VABS1_I]]
 float32x4_t test_vabsq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vabsq_f32
   return vabsq_f32(a);
-  // CHECK: fabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x double> @test_vabsq_f64(<2 x double> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK: [[VABS1_I:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[VABS_I]]) #2
+// CHECK: ret <2 x double> [[VABS1_I]]
 float64x2_t test_vabsq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vabsq_f64
   return vabsq_f64(a);
-  // CHECK: fabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vuqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VUQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #2
+// CHECK: ret <8 x i8> [[VUQADD_I]]
 int8x8_t test_vuqadd_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vuqadd_s8
   return vuqadd_s8(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vuqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VUQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #2
+// CHECK: ret <16 x i8> [[VUQADD_I]]
 int8x16_t test_vuqaddq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vuqaddq_s8
   return vuqaddq_s8(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vuqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[VUQADD_I]], <4 x i16> [[VUQADD1_I]]) #2
+// CHECK: ret <4 x i16> [[VUQADD2_I]]
 int16x4_t test_vuqadd_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vuqadd_s16
   return vuqadd_s16(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vuqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> [[VUQADD_I]], <8 x i16> [[VUQADD1_I]]) #2
+// CHECK: ret <8 x i16> [[VUQADD2_I]]
 int16x8_t test_vuqaddq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vuqaddq_s16
   return vuqaddq_s16(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
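suqadd ("signed saturating accumulate of unsigned value") treats the second operand as unsigned, adds it to the signed accumulator, and saturates to the signed range. A reference sketch; the helper name and the uint8_t operand type are assumptions for illustration (the test's C prototypes pass both operands as signed vectors):

#include <stdint.h>

/* vuqadd_s8 reference: a + (unsigned)b, clamped to the int8_t range.
   Since b is non-negative, the sum can only overflow upward. */
static int8_t uqadd_s8(int8_t a, uint8_t b) {
  int sum = (int)a + (int)b;
  return (int8_t)(sum > INT8_MAX ? INT8_MAX : sum);
}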
+// CHECK-LABEL: define <2 x i32> @test_vuqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> [[VUQADD_I]], <2 x i32> [[VUQADD1_I]]) #2
+// CHECK: ret <2 x i32> [[VUQADD2_I]]
 int32x2_t test_vuqadd_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vuqadd_s32
   return vuqadd_s32(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vuqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VUQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> [[VUQADD_I]], <4 x i32> [[VUQADD1_I]]) #2
+// CHECK: ret <4 x i32> [[VUQADD2_I]]
 int32x4_t test_vuqaddq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vuqaddq_s32
   return vuqaddq_s32(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i64> @test_vuqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> [[VUQADD_I]], <2 x i64> [[VUQADD1_I]]) #2
+// CHECK: ret <2 x i64> [[VUQADD2_I]]
 int64x2_t test_vuqaddq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vuqaddq_s64
   return vuqaddq_s64(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 {
+// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VCLS_V_I]]
 int8x8_t test_vcls_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vcls_s8
   return vcls_s8(a);
-  // CHECK: cls v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VCLSQ_V_I]]
 int8x16_t test_vclsq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vclsq_s8
   return vclsq_s8(a);
-  // CHECK: cls v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> [[VCLS_V_I]]) #2
+// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vcls_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vcls_s16
   return vcls_s16(a);
-  // CHECK: cls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> [[VCLSQ_V_I]]) #2
+// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP1]]
 int16x8_t test_vclsq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vclsq_s16
   return vclsq_s16(a);
-  // CHECK: cls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
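cls counts leading sign bits: how many bits directly below the sign bit agree with it. A sketch with an illustrative helper name:

#include <stdint.h>

/* vcls_s8 reference: count of consecutive bits below bit 7 equal to
   bit 7; range is 0..7, and cls(0) == cls(-1) == 7. */
static int cls_s8(int8_t x) {
  int sign = (x >> 7) & 1, n = 0;
  for (int bit = 6; bit >= 0 && ((x >> bit) & 1) == sign; --bit)
    ++n;
  return n;
}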
+// CHECK-LABEL: define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> [[VCLS_V_I]]) #2
+// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vcls_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vcls_s32
   return vcls_s32(a);
-  // CHECK: cls v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> [[VCLSQ_V_I]]) #2
+// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
 int32x4_t test_vclsq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vclsq_s32
   return vclsq_s32(a);
-  // CHECK: cls v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 {
+// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #2
+// CHECK: ret <8 x i8> [[VCLZ_V_I]]
 int8x8_t test_vclz_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vclz_s8
   return vclz_s8(a);
-  // CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #2
+// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
 int8x16_t test_vclzq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vclzq_s8
   return vclzq_s8(a);
-  // CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #2
+// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vclz_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vclz_s16
   return vclz_s16(a);
-  // CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #2
+// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP1]]
 int16x8_t test_vclzq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vclzq_s16
   return vclzq_s16(a);
-  // CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
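vclz maps onto the generic llvm.ctlz intrinsic; the `i1 false` second argument in the calls above marks a zero input as well-defined, yielding the lane width, which matches the AArch64 clz instruction. A sketch with an illustrative helper name:

#include <stdint.h>

/* vclz_u8 reference: leading-zero count per lane; clz(0) is defined
   as 8, the lane width, per the `i1 false` flag. */
static int clz_u8(uint8_t x) {
  int n = 0;
  for (int bit = 7; bit >= 0 && !((x >> bit) & 1); --bit)
    ++n;
  return n;
}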
+// CHECK-LABEL: define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #2
+// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vclz_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vclz_s32
   return vclz_s32(a);
-  // CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #2
+// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
 int32x4_t test_vclzq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vclzq_s32
   return vclzq_s32(a);
-  // CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i8> @test_vclz_u8(<8 x i8> %a) #0 {
+// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #2
+// CHECK: ret <8 x i8> [[VCLZ_V_I]]
 uint8x8_t test_vclz_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vclz_u8
   return vclz_u8(a);
-  // CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(<16 x i8> %a) #0 {
+// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #2
+// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
 uint8x16_t test_vclzq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vclzq_u8
   return vclzq_u8(a);
-  // CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vclz_u16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #2
+// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 uint16x4_t test_vclz_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vclz_u16
   return vclz_u16(a);
-  // CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
+// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #2
+// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP1]]
 uint16x8_t test_vclzq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vclzq_u16
   return vclzq_u16(a);
-  // CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i32> @test_vclz_u32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #2
+// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 uint32x2_t test_vclz_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vclz_u32
   return vclz_u32(a);
-  // CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
+// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #2
+// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
 uint32x4_t test_vclzq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vclzq_u32
   return vclzq_u32(a);
-  // CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 {
+// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VCNT_V_I]]
 int8x8_t test_vcnt_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vcnt_s8
   return vcnt_s8(a);
-  // CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
 int8x16_t test_vcntq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vcntq_s8
   return vcntq_s8(a);
-  // CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(<8 x i8> %a) #0 {
+// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VCNT_V_I]]
 uint8x8_t test_vcnt_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vcnt_u8
   return vcnt_u8(a);
-  // CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(<16 x i8> %a) #0 {
+// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
 uint8x16_t test_vcntq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vcntq_u8
   return vcntq_u8(a);
-  // CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(<8 x i8> %a) #0 {
+// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VCNT_V_I]]
 poly8x8_t test_vcnt_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vcnt_p8
   return vcnt_p8(a);
-  // CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(<16 x i8> %a) #0 {
+// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
 poly8x16_t test_vcntq_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vcntq_p8
   return vcntq_p8(a);
-  // CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a,
+// CHECK: ret <8 x i8> [[NEG_I]]
 int8x8_t test_vmvn_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vmvn_s8
   return vmvn_s8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a,
+// CHECK: ret <16 x i8> [[NEG_I]]
 int8x16_t test_vmvnq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vmvnq_s8
   return vmvnq_s8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a,
+// CHECK: ret <4 x i16> [[NEG_I]]
 int16x4_t test_vmvn_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vmvn_s16
   return vmvn_s16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a,
+// CHECK: ret <8 x i16> [[NEG_I]]
 int16x8_t test_vmvnq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vmvnq_s16
   return vmvnq_s16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
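vcnt is a per-byte population count (llvm.ctpop), and vmvn is a bitwise NOT, emitted as xor against an all-ones vector (the splat constant is elided above). Since NOT is element-size agnostic, the old assembly checks use the .8b/.16b byte arrangement for every vmvn variant regardless of lane width. Sketches with illustrative helper names:

#include <stdint.h>

/* vcnt_u8 reference: number of set bits in each byte lane. */
static uint8_t cnt_u8(uint8_t x) {
  uint8_t n = 0;
  for (; x; x = (uint8_t)(x & (x - 1)))  /* clear the lowest set bit */
    ++n;
  return n;
}

/* vmvn_u8 reference: bitwise complement, i.e. x ^ 0xFF per byte. */
static uint8_t mvn_u8(uint8_t x) { return (uint8_t)~x; }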
+// CHECK-LABEL: define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a,
+// CHECK: ret <2 x i32> [[NEG_I]]
 int32x2_t test_vmvn_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vmvn_s32
   return vmvn_s32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a,
+// CHECK: ret <4 x i32> [[NEG_I]]
 int32x4_t test_vmvnq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vmvnq_s32
   return vmvnq_s32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vmvn_u8(<8 x i8> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a,
+// CHECK: ret <8 x i8> [[NEG_I]]
 uint8x8_t test_vmvn_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vmvn_u8
   return vmvn_u8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8(<16 x i8> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a,
+// CHECK: ret <16 x i8> [[NEG_I]]
 uint8x16_t test_vmvnq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vmvnq_u8
   return vmvnq_u8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i16> @test_vmvn_u16(<4 x i16> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a,
+// CHECK: ret <4 x i16> [[NEG_I]]
 uint16x4_t test_vmvn_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vmvn_u16
   return vmvn_u16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16(<8 x i16> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a,
+// CHECK: ret <8 x i16> [[NEG_I]]
 uint16x8_t test_vmvnq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vmvnq_u16
   return vmvnq_u16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <2 x i32> @test_vmvn_u32(<2 x i32> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a,
+// CHECK: ret <2 x i32> [[NEG_I]]
 uint32x2_t test_vmvn_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vmvn_u32
   return vmvn_u32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32(<4 x i32> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a,
+// CHECK: ret <4 x i32> [[NEG_I]]
 uint32x4_t test_vmvnq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vmvnq_u32
   return vmvnq_u32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vmvn_p8(<8 x i8> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a,
+// CHECK: ret <8 x i8> [[NEG_I]]
 poly8x8_t test_vmvn_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vmvn_p8
   return vmvn_p8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8(<16 x i8> %a) #0 {
+// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a,
+// CHECK: ret <16 x i8> [[NEG_I]]
 poly8x16_t test_vmvnq_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vmvnq_p8
   return vmvnq_p8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vrbit_s8(<8 x i8> %a) #0 {
+// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VRBIT_I]]
 int8x8_t test_vrbit_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vrbit_s8
   return vrbit_s8(a);
-  // CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
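rbit reverses the bit order within each byte lane. A sketch with an illustrative helper name:

#include <stdint.h>

/* vrbit_u8 reference: bit i of the result is bit (7 - i) of the input. */
static uint8_t rbit_u8(uint8_t x) {
  uint8_t r = 0;
  for (int i = 0; i < 8; ++i)
    r = (uint8_t)((r << 1) | ((x >> i) & 1));
  return r;
}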
+// CHECK-LABEL: define <16 x i8> @test_vrbitq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VRBIT_I]]
 int8x16_t test_vrbitq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vrbitq_s8
   return vrbitq_s8(a);
-  // CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vrbit_u8(<8 x i8> %a) #0 {
+// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VRBIT_I]]
 uint8x8_t test_vrbit_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vrbit_u8
   return vrbit_u8(a);
-  // CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vrbitq_u8(<16 x i8> %a) #0 {
+// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VRBIT_I]]
 uint8x16_t test_vrbitq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vrbitq_u8
   return vrbitq_u8(a);
-  // CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vrbit_p8(<8 x i8> %a) #0 {
+// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2
+// CHECK: ret <8 x i8> [[VRBIT_I]]
 poly8x8_t test_vrbit_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vrbit_p8
   return vrbit_p8(a);
-  // CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
+// CHECK-LABEL: define <16 x i8> @test_vrbitq_p8(<16 x i8> %a) #0 {
+// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2
+// CHECK: ret <16 x i8> [[VRBIT_I]]
 poly8x16_t test_vrbitq_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vrbitq_p8
   return vrbitq_p8(a);
-  // CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
+// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VMOVN_I]]
 int8x8_t test_vmovn_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vmovn_s16
   return vmovn_s16(a);
-  // CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VMOVN_I]]
 int16x4_t test_vmovn_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vmovn_s32
   return vmovn_s32(a);
-  // CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VMOVN_I]]
 int32x2_t test_vmovn_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vmovn_s64
   return vmovn_s64(a);
-  // CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VMOVN_I]]
 uint8x8_t test_vmovn_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vmovn_u16
   return vmovn_u16(a);
-  // CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VMOVN_I]]
 uint16x4_t test_vmovn_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vmovn_u32
   return vmovn_u32(a);
-  // CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VMOVN_I]]
 uint32x2_t test_vmovn_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vmovn_u64
   return vmovn_u64(a);
-  // CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <16 x i8> @test_vmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32>
+// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vmovn_high_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vmovn_high_s16
   return vmovn_high_s16(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <8 x i16> @test_vmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32>
+// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vmovn_high_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vmovn_high_s32
   return vmovn_high_s32(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <4 x i32> @test_vmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32>
+// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vmovn_high_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vmovn_high_s64
   return vmovn_high_s64(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <16 x i8> @test_vmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32>
+// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vmovn_high_u16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vmovn_high_u16
   return vmovn_high_u16(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <8 x i16> @test_vmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32>
+// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vmovn_high_u32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vmovn_high_u32
   return vmovn_high_u32(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
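vmovn is a plain lane-wise truncation (xtn), and the _high variants narrow b and append it to the existing low half a, which the IR expresses as a trunc plus one shufflevector concatenating the two halves (mask elided above). A sketch with an illustrative helper name:

#include <stdint.h>

/* vmovn_high_s16 reference: keep a as lanes 0..7, then truncate each
   i16 lane of b to i8 for lanes 8..15. */
static void movn_high_s16(const int8_t a[8], const int16_t b[8], int8_t r[16]) {
  for (int i = 0; i < 8; ++i) r[i] = a[i];
  for (int i = 0; i < 8; ++i) r[8 + i] = (int8_t)b[i];  /* truncating */
}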
+// CHECK-LABEL: define <4 x i32> @test_vmovn_high_u64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32>
+// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vmovn_high_u64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vmovn_high_u64
   return vmovn_high_u64(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[VQMOVUN_V_I]]) #2
+// CHECK: ret <8 x i8> [[VQMOVUN_V1_I]]
 int8x8_t test_vqmovun_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqmovun_s16
   return vqmovun_s16(a);
-  // CHECK: sqxtun v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[VQMOVUN_V_I]]) #2
+// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vqmovun_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqmovun_s32
   return vqmovun_s32(a);
-  // CHECK: sqxtun v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[VQMOVUN_V_I]]) #2
+// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vqmovun_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqmovun_s64
   return vqmovun_s64(a);
-  // CHECK: sqxtun v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <16 x i8> @test_vqmovun_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[VQMOVUN_V_I_I]]) #2
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVUN_V1_I_I]], <16 x i32>
+// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vqmovun_high_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqmovun_high_s16
   return vqmovun_high_s16(a, b);
-  // CHECK: sqxtun2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <8 x i16> @test_vqmovun_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[VQMOVUN_V_I_I]]) #2
+// CHECK: [[VQMOVUN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I_I]] to <4 x i16>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[TMP1]], <8 x i32>
+// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vqmovun_high_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqmovun_high_s32
   return vqmovun_high_s32(a, b);
-  // CHECK: sqxtun2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <4 x i32> @test_vqmovun_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[VQMOVUN_V_I_I]]) #2
+// CHECK: [[VQMOVUN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I_I]] to <2 x i32>
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[TMP1]], <4 x i32>
+// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vqmovun_high_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqmovun_high_s64
   return vqmovun_high_s64(a, b);
-  // CHECK: sqxtun2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #2
+// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
 int8x8_t test_vqmovn_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqmovn_s16
   return vqmovn_s16(a);
-  // CHECK: sqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #2
+// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vqmovn_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqmovn_s32
   return vqmovn_s32(a);
-  // CHECK: sqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) #2
+// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vqmovn_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqmovn_s64
   return vqmovn_s64(a);
-  // CHECK: sqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
+// CHECK-LABEL: define <16 x i8> @test_vqmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[VQMOVN_V_I_I]]) #2
+// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32>
+// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vqmovn_high_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqmovn_high_s16
   return vqmovn_high_s16(a, b);
-  // CHECK: sqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
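Unlike the truncating vmovn, these narrows saturate: sqxtn clamps a signed wide lane to the signed narrow range, sqxtun clamps a signed wide lane to the unsigned narrow range, and uqxtn (below) is the unsigned-to-unsigned analogue. Sketches with illustrative helper names:

#include <stdint.h>

/* vqmovn_s16 reference (sqxtn): signed saturate to int8_t. */
static int8_t qmovn_s16(int16_t x) {
  if (x > INT8_MAX) return INT8_MAX;
  if (x < INT8_MIN) return INT8_MIN;
  return (int8_t)x;
}

/* vqmovun_s16 reference (sqxtun): signed input saturated to uint8_t. */
static uint8_t qmovun_s16(int16_t x) {
  if (x < 0) return 0;
  if (x > UINT8_MAX) return UINT8_MAX;
  return (uint8_t)x;
}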
@test_vqmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[TMP1]], <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] int16x8_t test_vqmovn_high_s32(int16x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vqmovn_high_s32 return vqmovn_high_s32(a, b); - // CHECK: sqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vqmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[TMP1]], <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] int32x4_t test_vqmovn_high_s64(int32x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vqmovn_high_s64 return vqmovn_high_s64(a, b); - // CHECK: sqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #2 +// CHECK: ret <8 x i8> [[VQMOVN_V1_I]] uint8x8_t test_vqmovn_u16(uint16x8_t a) { - // CHECK-LABEL: test_vqmovn_u16 return vqmovn_u16(a); - // CHECK: uqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h } +// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #2 +// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP1]] uint16x4_t test_vqmovn_u32(uint32x4_t a) { - // CHECK-LABEL: test_vqmovn_u32 return vqmovn_u32(a); - // CHECK: uqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) #2 +// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP1]] uint32x2_t test_vqmovn_u64(uint64x2_t a) { - // CHECK-LABEL: test_vqmovn_u64 return vqmovn_u64(a); - // CHECK: uqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <16 x i8> @test_vqmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 
x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] uint8x16_t test_vqmovn_high_u16(uint8x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vqmovn_high_u16 return vqmovn_high_u16(a, b); - // CHECK: uqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i16> @test_vqmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <4 x i16> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[TMP1]], <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] uint16x8_t test_vqmovn_high_u32(uint16x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vqmovn_high_u32 return vqmovn_high_u32(a, b); - // CHECK: uqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <4 x i32> @test_vqmovn_high_u64(<2 x i32> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[VQMOVN_V_I_I]]) #2 +// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <2 x i32> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[TMP1]], <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] uint32x4_t test_vqmovn_high_u64(uint32x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vqmovn_high_u64 return vqmovn_high_u64(a, b); - // CHECK: uqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> [[VSHLL_N]] int16x8_t test_vshll_n_s8(int8x8_t a) { - // CHECK-LABEL: test_vshll_n_s8 return vshll_n_s8(a, 8); - // CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8 } +// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], +// CHECK: ret <4 x i32> [[VSHLL_N]] int32x4_t test_vshll_n_s16(int16x4_t a) { - // CHECK-LABEL: test_vshll_n_s16 return vshll_n_s16(a, 16); - // CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16 } +// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], +// CHECK: ret <2 x i64> [[VSHLL_N]] int64x2_t test_vshll_n_s32(int32x2_t a) { - // CHECK-LABEL: test_vshll_n_s32 return vshll_n_s32(a, 32); - // CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32 } +// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = 
zext <8 x i8> %a to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> [[VSHLL_N]] uint16x8_t test_vshll_n_u8(uint8x8_t a) { - // CHECK-LABEL: test_vshll_n_u8 return vshll_n_u8(a, 8); - // CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8 } +// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], +// CHECK: ret <4 x i32> [[VSHLL_N]] uint32x4_t test_vshll_n_u16(uint16x4_t a) { - // CHECK-LABEL: test_vshll_n_u16 return vshll_n_u16(a, 16); - // CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16 } +// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], +// CHECK: ret <2 x i64> [[VSHLL_N]] uint64x2_t test_vshll_n_u32(uint32x2_t a) { - // CHECK-LABEL: test_vshll_n_u32 return vshll_n_u32(a, 32); - // CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32 }
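
Every vshll_n test above shifts by exactly the source lane width (8, 16 or 32), the one case the SHLL form covers, so the IR is nothing more than a sign or zero extend followed by a shl. The vshll_high_n_* tests that follow add only an initial shufflevector that extracts the high half before the same extend-and-shift. A reference formulation, illustrative only (the _ref helper name is made up and is not part of the patch):

  #include <arm_neon.h>

  // Sketch: vshll_n_s8(a, 8) behaves like widening first and then
  // shifting the widened lanes: sext <8 x i8> to <8 x i16>, then shl by 8.
  static inline int16x8_t vshll_n_s8_ref(int8x8_t a) {
    return vshlq_n_s16(vmovl_s8(a), 8);
  }
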
+// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> [[VSHLL_N]] int16x8_t test_vshll_high_n_s8(int8x16_t a) { - // CHECK-LABEL: test_vshll_high_n_s8 return vshll_high_n_s8(a, 8); - // CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8 } +// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], +// CHECK: ret <4 x i32> [[VSHLL_N]] int32x4_t test_vshll_high_n_s16(int16x8_t a) { - // CHECK-LABEL: test_vshll_high_n_s16 return vshll_high_n_s16(a, 16); - // CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16 } +// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], +// CHECK: ret <2 x i64> [[VSHLL_N]] int64x2_t test_vshll_high_n_s32(int32x4_t a) { - // CHECK-LABEL: test_vshll_high_n_s32 return vshll_high_n_s32(a, 32); - // CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32 } +// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], +// CHECK: ret <8 x i16> [[VSHLL_N]] uint16x8_t test_vshll_high_n_u8(uint8x16_t a) { - // CHECK-LABEL: test_vshll_high_n_u8 return vshll_high_n_u8(a, 8); - // CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8 } +// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], +// CHECK: ret <4 x i32> [[VSHLL_N]] uint32x4_t test_vshll_high_n_u16(uint16x8_t a) { - // CHECK-LABEL: test_vshll_high_n_u16 return vshll_high_n_u16(a, 16); - // CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16 } +// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], +// CHECK: ret <2 x i64> [[VSHLL_N]] uint64x2_t test_vshll_high_n_u32(uint32x4_t a) { - // CHECK-LABEL: test_vshll_high_n_u32 return vshll_high_n_u32(a, 32); - // CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32 } +// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) #2 +// CHECK: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half> +// CHECK: ret <4 x half> [[TMP1]] float16x4_t test_vcvt_f16_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvt_f16_f32 return vcvt_f16_f32(a); - // CHECK: fcvtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <8 x half> @test_vcvt_high_f16_f32(<4 x half> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VCVT_F16_F32_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVT_F16_F321_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I_I]]) #2 +// CHECK: [[VCVT_F16_F322_I_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I_I]] to <4 x half> +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x half> %a, <4 x half> [[TMP1]], <8 x i32> +// CHECK: ret <8 x half> [[SHUFFLE_I_I]] float16x8_t test_vcvt_high_f16_f32(float16x4_t a, float32x4_t b) { - //CHECK-LABEL: test_vcvt_high_f16_f32 return vcvt_high_f16_f32(a, b); - // CHECK: fcvtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x float> @test_vcvt_f32_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVT_I:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float> +// CHECK: ret <2 x float> [[VCVT_I]] float32x2_t test_vcvt_f32_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvt_f32_f64 return vcvt_f32_f64(a); - // CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVT_I_I:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float> +// CHECK:
[[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVT_I_I]], <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I_I]] float32x4_t test_vcvt_high_f32_f64(float32x2_t a, float64x2_t b) { - //CHECK-LABEL: test_vcvt_high_f32_f64 return vcvt_high_f32_f64(a, b); - // CHECK: fcvtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vcvtx_f32_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTX_F32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTX_F32_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[VCVTX_F32_V_I]]) #2 +// CHECK: ret <2 x float> [[VCVTX_F32_V1_I]] float32x2_t test_vcvtx_f32_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtx_f32_f64 return vcvtx_f32_f64(a); - // CHECK: fcvtxn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[VCVTX_F32_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTX_F32_V1_I_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[VCVTX_F32_V_I_I]]) #2 +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVTX_F32_V1_I_I]], <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I_I]] float32x4_t test_vcvtx_high_f32_f64(float32x2_t a, float64x2_t b) { - //CHECK-LABEL: test_vcvtx_high_f32_f64 return vcvtx_high_f32_f64(a, b); - // CHECK: fcvtxn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> +// CHECK: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) #2 +// CHECK: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vcvt_f32_f16(float16x4_t a) { - //CHECK-LABEL: test_vcvt_f32_f16 return vcvt_f32_f16(a); - // CHECK: fcvtl v{{[0-9]+}}.4s, v{{[0-9]+}}.4h } +// CHECK-LABEL: define <4 x float> @test_vcvt_high_f32_f16(<8 x half> %a) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[VCVT_F32_F16_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VCVT_F32_F161_I_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I_I]]) #2 +// CHECK: [[VCVT_F32_F162_I_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vcvt_high_f32_f16(float16x8_t a) { - //CHECK-LABEL: test_vcvt_high_f32_f16 return vcvt_high_f32_f16(a); - // CHECK: fcvtl2 v{{[0-9]+}}.4s, v{{[0-9]+}}.8h } +// CHECK-LABEL: define <2 x double> @test_vcvt_f64_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVT_I:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double> +// CHECK: ret <2 x double> [[VCVT_I]] float64x2_t test_vcvt_f64_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvt_f64_f32 return vcvt_f64_f32(a); - // CHECK: fcvtl v{{[0-9]+}}.2d, 
v{{[0-9]+}}.2s } +// CHECK-LABEL: define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %a) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVT_I_I:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double> +// CHECK: ret <2 x double> [[VCVT_I_I]] float64x2_t test_vcvt_high_f64_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvt_high_f64_f32 return vcvt_high_f64_f32(a); - // CHECK: fcvtl2 v{{[0-9]+}}.2d, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDN1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> [[VRNDN_I]]) #2 +// CHECK: ret <2 x float> [[VRNDN1_I]] float32x2_t test_vrndn_f32(float32x2_t a) { - //CHECK-LABEL: test_vrndn_f32 return vrndn_f32(a); - // CHECK: frintn v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDN1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> [[VRNDN_I]]) #2 +// CHECK: ret <4 x float> [[VRNDN1_I]] float32x4_t test_vrndnq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrndnq_f32 return vrndnq_f32(a); - // CHECK: frintn v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrndnq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRNDN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRNDN1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> [[VRNDN_I]]) #2 +// CHECK: ret <2 x double> [[VRNDN1_I]] float64x2_t test_vrndnq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrndnq_f64 return vrndnq_f64(a); - // CHECK: frintn v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDA1_I:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[VRNDA_I]]) #2 +// CHECK: ret <2 x float> [[VRNDA1_I]] float32x2_t test_vrnda_f32(float32x2_t a) { - //CHECK-LABEL: test_vrnda_f32 return vrnda_f32(a); - // CHECK: frinta v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDA1_I:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[VRNDA_I]]) #2 +// CHECK: ret <4 x float> [[VRNDA1_I]] float32x4_t test_vrndaq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrndaq_f32 return vrndaq_f32(a); - // CHECK: frinta v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrndaq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[VRNDA_I]]) #2 +// CHECK: ret <2 x double> [[VRNDA1_I]] float64x2_t test_vrndaq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrndaq_f64 return 
vrndaq_f64(a); - // CHECK: frinta v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDP1_I:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[VRNDP_I]]) #2 +// CHECK: ret <2 x float> [[VRNDP1_I]] float32x2_t test_vrndp_f32(float32x2_t a) { - //CHECK-LABEL: test_vrndp_f32 return vrndp_f32(a); - // CHECK: frintp v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDP1_I:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[VRNDP_I]]) #2 +// CHECK: ret <4 x float> [[VRNDP1_I]] float32x4_t test_vrndpq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrndpq_f32 return vrndpq_f32(a); - // CHECK: frintp v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrndpq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[VRNDP_I]]) #2 +// CHECK: ret <2 x double> [[VRNDP1_I]] float64x2_t test_vrndpq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrndpq_f64 return vrndpq_f64(a); - // CHECK: frintp v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDM1_I:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[VRNDM_I]]) #2 +// CHECK: ret <2 x float> [[VRNDM1_I]] float32x2_t test_vrndm_f32(float32x2_t a) { - //CHECK-LABEL: test_vrndm_f32 return vrndm_f32(a); - // CHECK: frintm v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDM1_I:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[VRNDM_I]]) #2 +// CHECK: ret <4 x float> [[VRNDM1_I]] float32x4_t test_vrndmq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrndmq_f32 return vrndmq_f32(a); - // CHECK: frintm v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrndmq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[VRNDM_I]]) #2 +// CHECK: ret <2 x double> [[VRNDM1_I]] float64x2_t test_vrndmq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrndmq_f64 return vrndmq_f64(a); - // CHECK: frintm v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDX1_I:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> [[VRNDX_I]]) #2 +// CHECK: ret <2 x float> [[VRNDX1_I]] float32x2_t test_vrndx_f32(float32x2_t a) { - //CHECK-LABEL: test_vrndx_f32 return vrndx_f32(a); - // CHECK: frintx v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> 
@test_vrndxq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDX1_I:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[VRNDX_I]]) #2 +// CHECK: ret <4 x float> [[VRNDX1_I]] float32x4_t test_vrndxq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrndxq_f32 return vrndxq_f32(a); - // CHECK: frintx v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrndxq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[VRNDX_I]]) #2 +// CHECK: ret <2 x double> [[VRNDX1_I]] float64x2_t test_vrndxq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrndxq_f64 return vrndxq_f64(a); - // CHECK: frintx v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDZ_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDZ1_I:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> [[VRNDZ_I]]) #2 +// CHECK: ret <2 x float> [[VRNDZ1_I]] float32x2_t test_vrnd_f32(float32x2_t a) { - //CHECK-LABEL: test_vrnd_f32 return vrnd_f32(a); - // CHECK: frintz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDZ1_I:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[VRNDZ_I]]) #2 +// CHECK: ret <4 x float> [[VRNDZ1_I]] float32x4_t test_vrndq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrndq_f32 return vrndq_f32(a); - // CHECK: frintz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrndq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[VRNDZ_I]]) #2 +// CHECK: ret <2 x double> [[VRNDZ1_I]] float64x2_t test_vrndq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrndq_f64 return vrndq_f64(a); - // CHECK: frintz v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrndi_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDI_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[VRNDI_I]]) #2 +// CHECK: ret <2 x float> [[VRNDI1_I]] float32x2_t test_vrndi_f32(float32x2_t a) { - //CHECK-LABEL: test_vrndi_f32 return vrndi_f32(a); - // CHECK: frinti v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrndiq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDI_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[VRNDI_I]]) #2 +// CHECK: ret <4 x float> [[VRNDI1_I]] float32x4_t test_vrndiq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrndiq_f32 return vrndiq_f32(a); - // CHECK: frinti v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrndiq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: 
[[VRNDI_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRNDI1_I:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[VRNDI_I]]) #2 +// CHECK: ret <2 x double> [[VRNDI1_I]] float64x2_t test_vrndiq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrndiq_f64 return vrndiq_f64(a); - // CHECK: frinti v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }
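
That completes the rounding tests. As the CHECK lines above show, only vrndn needs a target intrinsic (@llvm.aarch64.neon.frintn); the others map to generic LLVM intrinsics: vrnda to @llvm.round, vrndp to @llvm.ceil, vrndm to @llvm.floor, vrndx to @llvm.rint, vrnd to @llvm.trunc, and vrndi to @llvm.nearbyint. For reference, illustrative only (the helper name is made up and is not part of the patch):

  #include <arm_neon.h>

  // Sketch: the seven AArch64 rounding flavors side by side.
  static inline void rounding_modes(float32x2_t v, float32x2_t r[7]) {
    r[0] = vrndn_f32(v); // frintn: to nearest, ties to even
    r[1] = vrnda_f32(v); // frinta: to nearest, ties away from zero
    r[2] = vrndp_f32(v); // frintp: toward plus infinity (ceil)
    r[3] = vrndm_f32(v); // frintm: toward minus infinity (floor)
    r[4] = vrndx_f32(v); // frintx: current mode, may raise inexact
    r[5] = vrnd_f32(v);  // frintz: toward zero (truncate)
    r[6] = vrndi_f32(v); // frinti: current mode, no inexact exception
  }
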
+// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fptosi <2 x float> [[TMP1]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vcvt_s32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvt_s32_f32 return vcvt_s32_f32(a); - // CHECK: fcvtzs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vcvtq_s32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtq_s32_f32 return vcvtq_s32_f32(a); - // CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtq_s64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP2:%.*]] = fptosi <2 x double> [[TMP1]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vcvtq_s64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtq_s64_f64 return vcvtq_s64_f64(a); - // CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fptoui <2 x float> [[TMP1]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vcvt_u32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvt_u32_f32 return vcvt_u32_f32(a); - // CHECK: fcvtzu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vcvtq_u32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtq_u32_f32 return vcvtq_u32_f32(a); - // CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtq_u64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[TMP2:%.*]] = fptoui <2 x double> [[TMP1]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vcvtq_u64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtq_u64_f64 return vcvtq_u64_f64(a); - // CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> [[VCVTN_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTN1_I]] int32x2_t test_vcvtn_s32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvtn_s32_f32 return vcvtn_s32_f32(a); - // CHECK: fcvtns v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> [[VCVTN_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTN1_I]] int32x4_t test_vcvtnq_s32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtnq_s32_f32 return vcvtnq_s32_f32(a); - // CHECK: fcvtns v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtnq_s64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> [[VCVTN_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTN1_I]] int64x2_t test_vcvtnq_s64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtnq_s64_f64 return vcvtnq_s64_f64(a); - // CHECK: fcvtns v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> [[VCVTN_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTN1_I]] uint32x2_t test_vcvtn_u32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvtn_u32_f32 return vcvtn_u32_f32(a); - // CHECK: fcvtnu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> [[VCVTN_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTN1_I]] uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtnq_u32_f32 return vcvtnq_u32_f32(a); - // CHECK: fcvtnu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtnq_u64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> [[VCVTN_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTN1_I]] uint64x2_t test_vcvtnq_u64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtnq_u64_f64 return vcvtnq_u64_f64(a); - // CHECK: fcvtnu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> [[VCVTP_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTP1_I]] int32x2_t test_vcvtp_s32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvtp_s32_f32 return vcvtp_s32_f32(a); - // CHECK: fcvtps v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> [[VCVTP_I]]) #2 +//
CHECK: ret <4 x i32> [[VCVTP1_I]] int32x4_t test_vcvtpq_s32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtpq_s32_f32 return vcvtpq_s32_f32(a); - // CHECK: fcvtps v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtpq_s64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> [[VCVTP_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTP1_I]] int64x2_t test_vcvtpq_s64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtpq_s64_f64 return vcvtpq_s64_f64(a); - // CHECK: fcvtps v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> [[VCVTP_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTP1_I]] uint32x2_t test_vcvtp_u32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvtp_u32_f32 return vcvtp_u32_f32(a); - // CHECK: fcvtpu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> [[VCVTP_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTP1_I]] uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtpq_u32_f32 return vcvtpq_u32_f32(a); - // CHECK: fcvtpu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtpq_u64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> [[VCVTP_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTP1_I]] uint64x2_t test_vcvtpq_u64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtpq_u64_f64 return vcvtpq_u64_f64(a); - // CHECK: fcvtpu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> [[VCVTM_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTM1_I]] int32x2_t test_vcvtm_s32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvtm_s32_f32 return vcvtm_s32_f32(a); - // CHECK: fcvtms v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> [[VCVTM_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTM1_I]] int32x4_t test_vcvtmq_s32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtmq_s32_f32 return vcvtmq_s32_f32(a); - // CHECK: fcvtms v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtmq_s64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTM1_I:%.*]] 
= call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> [[VCVTM_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTM1_I]] int64x2_t test_vcvtmq_s64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtmq_s64_f64 return vcvtmq_s64_f64(a); - // CHECK: fcvtms v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> [[VCVTM_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTM1_I]] uint32x2_t test_vcvtm_u32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvtm_u32_f32 return vcvtm_u32_f32(a); - // CHECK: fcvtmu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> [[VCVTM_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTM1_I]] uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtmq_u32_f32 return vcvtmq_u32_f32(a); - // CHECK: fcvtmu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtmq_u64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> [[VCVTM_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTM1_I]] uint64x2_t test_vcvtmq_u64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtmq_u64_f64 return vcvtmq_u64_f64(a); - // CHECK: fcvtmu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> [[VCVTA_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTA1_I]] int32x2_t test_vcvta_s32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvta_s32_f32 return vcvta_s32_f32(a); - // CHECK: fcvtas v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> [[VCVTA_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTA1_I]] int32x4_t test_vcvtaq_s32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtaq_s32_f32 return vcvtaq_s32_f32(a); - // CHECK: fcvtas v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtaq_s64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> [[VCVTA_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTA1_I]] int64x2_t test_vcvtaq_s64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtaq_s64_f64 return vcvtaq_s64_f64(a); - // CHECK: fcvtas v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: 
[[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> [[VCVTA_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTA1_I]] uint32x2_t test_vcvta_u32_f32(float32x2_t a) { - //CHECK-LABEL: test_vcvta_u32_f32 return vcvta_u32_f32(a); - // CHECK: fcvtau v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> [[VCVTA_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTA1_I]] uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) { - //CHECK-LABEL: test_vcvtaq_u32_f32 return vcvtaq_u32_f32(a); - // CHECK: fcvtau v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vcvtaq_u64_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> [[VCVTA_I]]) #2 +// CHECK: ret <2 x i64> [[VCVTA1_I]] uint64x2_t test_vcvtaq_u64_f64(float64x2_t a) { - //CHECK-LABEL: test_vcvtaq_u64_f64 return vcvtaq_u64_f64(a); - // CHECK: fcvtau v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> [[VRSQRTE_V_I]]) #2 +// CHECK: ret <2 x float> [[VRSQRTE_V1_I]] float32x2_t test_vrsqrte_f32(float32x2_t a) { - //CHECK-LABEL: test_vrsqrte_f32 return vrsqrte_f32(a); - // CHECK: frsqrte v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> [[VRSQRTEQ_V_I]]) #2 +// CHECK: ret <4 x float> [[VRSQRTEQ_V1_I]] float32x4_t test_vrsqrteq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrsqrteq_f32 return vrsqrteq_f32(a); - // CHECK: frsqrte v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrsqrteq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> [[VRSQRTEQ_V_I]]) #2 +// CHECK: ret <2 x double> [[VRSQRTEQ_V1_I]] float64x2_t test_vrsqrteq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrsqrteq_f64 return vrsqrteq_f64(a); - // CHECK: frsqrte v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> [[VRECPE_V_I]]) #2 +// CHECK: ret <2 x float> [[VRECPE_V1_I]] float32x2_t test_vrecpe_f32(float32x2_t a) { - //CHECK-LABEL: test_vrecpe_f32 return vrecpe_f32(a); - // CHECK: frecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: 
define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> [[VRECPEQ_V_I]]) #2 +// CHECK: ret <4 x float> [[VRECPEQ_V1_I]] float32x4_t test_vrecpeq_f32(float32x4_t a) { - //CHECK-LABEL: test_vrecpeq_f32 return vrecpeq_f32(a); - // CHECK: frecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vrecpeq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VRECPEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> [[VRECPEQ_V_I]]) #2 +// CHECK: ret <2 x double> [[VRECPEQ_V1_I]] float64x2_t test_vrecpeq_f64(float64x2_t a) { - //CHECK-LABEL: test_vrecpeq_f64 return vrecpeq_f64(a); - // CHECK: frecpe v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> [[VRECPE_V_I]]) #2 +// CHECK: ret <2 x i32> [[VRECPE_V1_I]] uint32x2_t test_vrecpe_u32(uint32x2_t a) { - //CHECK-LABEL: test_vrecpe_u32 return vrecpe_u32(a); - // CHECK: urecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> [[VRECPEQ_V_I]]) #2 +// CHECK: ret <4 x i32> [[VRECPEQ_V1_I]] uint32x4_t test_vrecpeq_u32(uint32x4_t a) { - //CHECK-LABEL: test_vrecpeq_u32 return vrecpeq_u32(a); - // CHECK: urecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x float> @test_vsqrt_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VSQRT_I:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[TMP1]]) #2 +// CHECK: ret <2 x float> [[VSQRT_I]] float32x2_t test_vsqrt_f32(float32x2_t a) { - //CHECK-LABEL: test_vsqrt_f32 return vsqrt_f32(a); - // CHECK: fsqrt v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vsqrtq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]]) #2 +// CHECK: ret <4 x float> [[VSQRT_I]] float32x4_t test_vsqrtq_f32(float32x4_t a) { - //CHECK-LABEL: test_vsqrtq_f32 return vsqrtq_f32(a); - // CHECK: fsqrt v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vsqrtq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]]) #2 +// CHECK: ret <2 x double> [[VSQRT_I]] float64x2_t test_vsqrtq_f64(float64x2_t a) { - //CHECK-LABEL: test_vsqrtq_f64 return vsqrtq_f64(a); - // CHECK: fsqrt v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x float> 
@test_vcvt_f32_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> +// CHECK: ret <2 x float> [[VCVT_I]] float32x2_t test_vcvt_f32_s32(int32x2_t a) { - //CHECK-LABEL: test_vcvt_f32_s32 return vcvt_f32_s32(a); - //CHECK: scvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x float> +// CHECK: ret <2 x float> [[VCVT_I]] float32x2_t test_vcvt_f32_u32(uint32x2_t a) { - //CHECK-LABEL: test_vcvt_f32_u32 return vcvt_f32_u32(a); - //CHECK: ucvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s } +// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +// CHECK: ret <4 x float> [[VCVT_I]] float32x4_t test_vcvtq_f32_s32(int32x4_t a) { - //CHECK-LABEL: test_vcvtq_f32_s32 return vcvtq_f32_s32(a); - //CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float> +// CHECK: ret <4 x float> [[VCVT_I]] float32x4_t test_vcvtq_f32_u32(uint32x4_t a) { - //CHECK-LABEL: test_vcvtq_f32_u32 return vcvtq_f32_u32(a); - //CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vcvtq_f64_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double> +// CHECK: ret <2 x double> [[VCVT_I]] float64x2_t test_vcvtq_f64_s64(int64x2_t a) { - //CHECK-LABEL: test_vcvtq_f64_s64 return vcvtq_f64_s64(a); - //CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } +// CHECK-LABEL: define <2 x double> @test_vcvtq_f64_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double> +// CHECK: ret <2 x double> [[VCVT_I]] float64x2_t test_vcvtq_f64_u64(uint64x2_t a) { - //CHECK-LABEL: test_vcvtq_f64_u64 return vcvtq_f64_u64(a); - //CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d } Index: test/CodeGen/aarch64-neon-perm.c =================================================================== --- test/CodeGen/aarch64-neon-perm.c +++ test/CodeGen/aarch64-neon-perm.c @@ -1,1092 +1,2279 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s // Test new aarch64 intrinsics and types #include +// CHECK-LABEL: define <8 x i8> @test_vuzp1_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vuzp1_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vuzp1_s8 return vuzp1_s8(a, b); - // CHECK: uzp1 
{{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vuzp1q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vuzp1q_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vuzp1q_s8 return vuzp1q_s8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vuzp1_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vuzp1_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vuzp1_s16 return vuzp1_s16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vuzp1q_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vuzp1q_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vuzp1q_s16 return vuzp1q_s16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vuzp1_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vuzp1_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vuzp1_s32 return vuzp1_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vuzp1q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vuzp1q_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vuzp1q_s32 return vuzp1q_s32(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vuzp1q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vuzp1q_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vuzp1q_s64 return vuzp1q_s64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vuzp1_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vuzp1_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vuzp1_u8 return vuzp1_u8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vuzp1q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vuzp1q_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vuzp1q_u8 return vuzp1q_u8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vuzp1_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vuzp1_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vuzp1_u16 return vuzp1_u16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vuzp1q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> 
+// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vuzp1q_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vuzp1q_u16 return vuzp1q_u16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vuzp1_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vuzp1_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vuzp1_u32 return vuzp1_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vuzp1q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vuzp1q_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vuzp1q_u32 return vuzp1q_u32(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vuzp1q_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vuzp1q_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vuzp1q_u64 return vuzp1q_u64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <2 x float> @test_vuzp1_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vuzp1_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vuzp1_f32 return vuzp1_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x float> @test_vuzp1q_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vuzp1q_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vuzp1q_f32 return vuzp1q_f32(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vuzp1q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> +// CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vuzp1q_f64(float64x2_t a, float64x2_t b) { - // CHECK-LABEL: test_vuzp1q_f64 return vuzp1q_f64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vuzp1_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vuzp1_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vuzp1_p8 return vuzp1_p8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vuzp1q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vuzp1q_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vuzp1q_p8 return vuzp1q_p8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vuzp1_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t 
test_vuzp1_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vuzp1_p16 return vuzp1_p16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vuzp1q_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vuzp1q_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vuzp1q_p16 return vuzp1q_p16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i8> @test_vuzp2_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vuzp2_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vuzp2_s8 return vuzp2_s8(a, b); - // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vuzp2q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vuzp2q_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vuzp2q_s8 return vuzp2q_s8(a, b); - // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vuzp2_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vuzp2_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vuzp2_s16 return vuzp2_s16(a, b); - // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vuzp2q_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vuzp2q_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vuzp2q_s16 return vuzp2q_s16(a, b); - // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vuzp2_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vuzp2_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vuzp2_s32 return vuzp2_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vuzp2q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vuzp2q_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vuzp2q_s32 return vuzp2q_s32(a, b); - // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vuzp2q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vuzp2q_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vuzp2q_s64 return vuzp2q_s64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vuzp2_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vuzp2_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vuzp2_u8 return vuzp2_u8(a, b); - // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vuzp2q_u8(<16 x i8> 
%a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vuzp2q_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vuzp2q_u8 return vuzp2q_u8(a, b); - // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vuzp2_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vuzp2_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vuzp2_u16 return vuzp2_u16(a, b); - // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vuzp2q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vuzp2q_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vuzp2q_u16 return vuzp2q_u16(a, b); - // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vuzp2_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vuzp2_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vuzp2_u32 return vuzp2_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vuzp2q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vuzp2q_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vuzp2q_u32 return vuzp2q_u32(a, b); - // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vuzp2q_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vuzp2q_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vuzp2q_u64 return vuzp2q_u64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <2 x float> @test_vuzp2_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vuzp2_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vuzp2_f32 return vuzp2_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x float> @test_vuzp2q_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vuzp2q_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vuzp2q_f32 return vuzp2q_f32(a, b); - // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vuzp2q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> +// CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vuzp2q_f64(float64x2_t a, float64x2_t b) { - // CHECK-LABEL: test_vuzp2q_f64 return vuzp2q_f64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vuzp2_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = 
shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vuzp2_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vuzp2_p8 return vuzp2_p8(a, b); - // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vuzp2q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vuzp2q_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vuzp2q_p8 return vuzp2q_p8(a, b); - // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vuzp2_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vuzp2_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vuzp2_p16 return vuzp2_p16(a, b); - // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vuzp2q_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vuzp2q_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vuzp2q_p16 return vuzp2q_p16(a, b); - // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i8> @test_vzip1_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vzip1_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vzip1_s8 return vzip1_s8(a, b); - // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vzip1q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vzip1q_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vzip1q_s8 return vzip1q_s8(a, b); - // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vzip1_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vzip1_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vzip1_s16 return vzip1_s16(a, b); - // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vzip1q_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vzip1q_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vzip1q_s16 return vzip1q_s16(a, b); - // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vzip1_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vzip1_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vzip1_s32 return vzip1_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vzip1q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vzip1q_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vzip1q_s32 return vzip1q_s32(a, b); - // CHECK: zip1 {{v[0-9]+}}.4s, 
{{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vzip1q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vzip1q_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vzip1q_s64 return vzip1q_s64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vzip1_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vzip1_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vzip1_u8 return vzip1_u8(a, b); - // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vzip1q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vzip1q_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vzip1q_u8 return vzip1q_u8(a, b); - // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vzip1_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vzip1_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vzip1_u16 return vzip1_u16(a, b); - // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vzip1q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vzip1q_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vzip1q_u16 return vzip1q_u16(a, b); - // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vzip1_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vzip1_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vzip1_u32 return vzip1_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vzip1q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vzip1q_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vzip1q_u32 return vzip1q_u32(a, b); - // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vzip1q_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vzip1q_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vzip1q_u64 return vzip1q_u64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <2 x float> @test_vzip1_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vzip1_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vzip1_f32 return vzip1_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x float> @test_vzip1q_f32(<4 x float> %a, <4 x float> %b) #0 { +// 
CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vzip1q_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vzip1q_f32 return vzip1q_f32(a, b); - // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vzip1q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> +// CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vzip1q_f64(float64x2_t a, float64x2_t b) { - // CHECK-LABEL: test_vzip1q_f64 return vzip1q_f64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vzip1_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vzip1_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vzip1_p8 return vzip1_p8(a, b); - // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vzip1q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vzip1q_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vzip1q_p8 return vzip1q_p8(a, b); - // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vzip1_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vzip1_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vzip1_p16 return vzip1_p16(a, b); - // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vzip1q_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vzip1q_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vzip1q_p16 return vzip1q_p16(a, b); - // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i8> @test_vzip2_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vzip2_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vzip2_s8 return vzip2_s8(a, b); - // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vzip2q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vzip2q_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vzip2q_s8 return vzip2q_s8(a, b); - // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vzip2_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vzip2_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vzip2_s16 return vzip2_s16(a, b); - // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vzip2q_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vzip2q_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: 
test_vzip2q_s16 return vzip2q_s16(a, b); - // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vzip2_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vzip2_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vzip2_s32 return vzip2_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vzip2q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vzip2q_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vzip2q_s32 return vzip2q_s32(a, b); - // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vzip2q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vzip2q_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vzip2q_s64 return vzip2q_s64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vzip2_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vzip2_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vzip2_u8 return vzip2_u8(a, b); - // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vzip2q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vzip2q_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vzip2q_u8 return vzip2q_u8(a, b); - // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vzip2_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vzip2_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vzip2_u16 return vzip2_u16(a, b); - // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vzip2q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vzip2q_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vzip2q_u16 return vzip2q_u16(a, b); - // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vzip2_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vzip2_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vzip2_u32 return vzip2_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vzip2q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vzip2q_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vzip2q_u32 return vzip2q_u32(a, b); - // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vzip2q_u64(<2 x i64> %a, <2 x 
i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vzip2q_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vzip2q_u64 return vzip2q_u64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <2 x float> @test_vzip2_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vzip2_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vzip2_f32 return vzip2_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x float> @test_vzip2q_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vzip2q_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vzip2q_f32 return vzip2q_f32(a, b); - // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vzip2q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> +// CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vzip2q_f64(float64x2_t a, float64x2_t b) { - // CHECK-LABEL: test_vzip2q_f64 return vzip2q_f64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vzip2_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vzip2_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vzip2_p8 return vzip2_p8(a, b); - // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vzip2q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vzip2q_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vzip2q_p8 return vzip2q_p8(a, b); - // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vzip2_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vzip2_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vzip2_p16 return vzip2_p16(a, b); - // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vzip2q_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vzip2q_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vzip2q_p16 return vzip2q_p16(a, b); - // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i8> @test_vtrn1_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vtrn1_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vtrn1_s8 return vtrn1_s8(a, b); - // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vtrn1q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x 
i8> [[SHUFFLE_I]] int8x16_t test_vtrn1q_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vtrn1q_s8 return vtrn1q_s8(a, b); - // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vtrn1_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vtrn1_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vtrn1_s16 return vtrn1_s16(a, b); - // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vtrn1q_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vtrn1q_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vtrn1q_s16 return vtrn1q_s16(a, b); - // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vtrn1_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vtrn1_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vtrn1_s32 return vtrn1_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vtrn1q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vtrn1q_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vtrn1q_s32 return vtrn1q_s32(a, b); - // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vtrn1q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vtrn1q_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vtrn1q_s64 return vtrn1q_s64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vtrn1_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vtrn1_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtrn1_u8 return vtrn1_u8(a, b); - // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vtrn1q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vtrn1q_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vtrn1q_u8 return vtrn1q_u8(a, b); - // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vtrn1_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vtrn1_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vtrn1_u16 return vtrn1_u16(a, b); - // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vtrn1q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vtrn1q_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vtrn1q_u16 return vtrn1q_u16(a, b); - // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// 
CHECK-LABEL: define <2 x i32> @test_vtrn1_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vtrn1_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vtrn1_u32 return vtrn1_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vtrn1q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vtrn1q_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vtrn1q_u32 return vtrn1q_u32(a, b); - // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vtrn1q_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vtrn1q_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vtrn1q_u64 return vtrn1q_u64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <2 x float> @test_vtrn1_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vtrn1_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vtrn1_f32 return vtrn1_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x float> @test_vtrn1q_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vtrn1q_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vtrn1q_f32 return vtrn1q_f32(a, b); - // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vtrn1q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> +// CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vtrn1q_f64(float64x2_t a, float64x2_t b) { - // CHECK-LABEL: test_vtrn1q_f64 return vtrn1q_f64(a, b); - // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vtrn1_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vtrn1_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vtrn1_p8 return vtrn1_p8(a, b); - // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vtrn1q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vtrn1q_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vtrn1q_p8 return vtrn1q_p8(a, b); - // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vtrn1_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vtrn1_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vtrn1_p16 return vtrn1_p16(a, b); - // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vtrn1q_p16(<8 x i16> %a, <8 x i16> 
%b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vtrn1q_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vtrn1q_p16 return vtrn1q_p16(a, b); - // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <8 x i8> @test_vtrn2_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vtrn2_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vtrn2_s8 return vtrn2_s8(a, b); - // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vtrn2q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vtrn2q_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vtrn2q_s8 return vtrn2q_s8(a, b); - // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vtrn2_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vtrn2_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vtrn2_s16 return vtrn2_s16(a, b); - // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vtrn2q_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vtrn2q_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vtrn2q_s16 return vtrn2q_s16(a, b); - // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vtrn2_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vtrn2_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vtrn2_s32 return vtrn2_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vtrn2q_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vtrn2q_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vtrn2q_s32 return vtrn2q_s32(a, b); - // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vtrn2q_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vtrn2q_s64(int64x2_t a, int64x2_t b) { - // CHECK-LABEL: test_vtrn2q_s64 return vtrn2q_s64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vtrn2_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vtrn2_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtrn2_u8 return vtrn2_u8(a, b); - // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vtrn2q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vtrn2q_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: 
test_vtrn2q_u8 return vtrn2q_u8(a, b); - // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vtrn2_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vtrn2_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vtrn2_u16 return vtrn2_u16(a, b); - // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vtrn2q_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vtrn2q_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vtrn2q_u16 return vtrn2q_u16(a, b); - // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define <2 x i32> @test_vtrn2_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vtrn2_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vtrn2_u32 return vtrn2_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x i32> @test_vtrn2q_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vtrn2q_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vtrn2q_u32 return vtrn2q_u32(a, b); - // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x i64> @test_vtrn2q_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vtrn2q_u64(uint64x2_t a, uint64x2_t b) { - // CHECK-LABEL: test_vtrn2q_u64 return vtrn2q_u64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <2 x float> @test_vtrn2_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vtrn2_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vtrn2_f32 return vtrn2_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define <4 x float> @test_vtrn2q_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vtrn2q_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vtrn2q_f32 return vtrn2q_f32(a, b); - // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define <2 x double> @test_vtrn2q_f64(<2 x double> %a, <2 x double> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> +// CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vtrn2q_f64(float64x2_t a, float64x2_t b) { - // CHECK-LABEL: test_vtrn2q_f64 return vtrn2q_f64(a, b); - // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}} } +// CHECK-LABEL: define <8 x i8> @test_vtrn2_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vtrn2_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vtrn2_p8 return vtrn2_p8(a, b); - // CHECK: trn2 
{{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vtrn2q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vtrn2q_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vtrn2q_p8 return vtrn2q_p8(a, b); - // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <4 x i16> @test_vtrn2_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vtrn2_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vtrn2_p16 return vtrn2_p16(a, b); - // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define <8 x i16> @test_vtrn2q_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vtrn2q_p16 return vtrn2q_p16(a, b); - // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.int8x8x2_t @test_vuzp_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x2_t [[TMP8]] int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vuzp_s8 return vuzp_s8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.int16x4x2_t @test_vuzp_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VUZP_I:%.*]] = 
shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6> +// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7> +// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP12]] int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vuzp_s16 return vuzp_s16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.int32x2x2_t @test_vuzp_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2> +// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3> +// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP12]] int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vuzp_s32 return vuzp_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} }
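// Illustrative sketch, not part of the generated checks: vuzp de-interleaves its two
// operands, which is why each struct-returning test above is checked as a pair of
// shufflevectors (one even-lane mask, one odd-lane mask) stored into adjacent fields
// of the returned aggregate. Assuming a = {a0, a1} and b = {b0, b1}, vuzp_s32(a, b)
// yields {{a0, b0}, {a1, b1}}. The helper name below is hypothetical and exists only
// to demonstrate that lane layout:
static int32x2x2_t uzp_lane_sketch(int32x2_t a, int32x2_t b) {
  int32x2x2_t r = vuzp_s32(a, b);
  // r.val[0] holds the even lanes {a0, b0}; r.val[1] holds the odd lanes {a1, b1}.
  return r;
}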
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vuzp_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x2_t [[TMP8]] uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vuzp_u8 return vuzp_u8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.uint16x4x2_t @test_vuzp_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6> +// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7> +// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP12]] uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vuzp_u16 return vuzp_u16(a, b); - // CHECK: uzp1
{{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vuzp_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2> +// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3> +// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP12]] uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vuzp_u32 return vuzp_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} }
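// Sketch of why the checks above look the way they do (hypothetical helper, not a
// CHECK block): the x2 intrinsics return a homogeneous aggregate, so unoptimized
// Clang output materializes it through %struct.*x2_t allocas and an @llvm.memcpy,
// and because this test only runs mem2reg over the IR, those allocas and the memcpy
// survive into the CHECK lines. Consuming one half of the result looks like this:
static uint32x2_t uzp_even_lanes(uint32x2_t a, uint32x2_t b) {
  uint32x2x2_t parts = vuzp_u32(a, b);
  return parts.val[0]; // the even-lane vector matched as [[VUZP_I]] above
}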
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vuzp_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2> +// CHECK: store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3> +// CHECK: store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x float>] [[TMP11]], [2 x <2 x float>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP12]] float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vuzp_f32 return vuzp_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vuzp_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x2_t [[TMP8]] poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vuzp_p8 return vuzp_p8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.poly16x4x2_t @test_vuzp_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6> +// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7> +// CHECK: store <4 x i16> [[VUZP1_I]],
<4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP12]] poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vuzp_p16 return vuzp_p16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.int8x16x2_t @test_vuzpq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> +// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> +// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP8]] int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vuzpq_s8 return vuzpq_s8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.int16x8x2_t @test_vuzpq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +// CHECK: store <8 x i16>
[[VUZP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP1]]2 int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vuzpq_s16 return vuzpq_s16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.int32x4x2_t @test_vuzpq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP1]]1, [2 x <4 x i32>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP1]]2 int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vuzpq_s32 return vuzpq_s32(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.uint8x16x2_t @test_vuzpq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca 
%struct.uint8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x2_t [[TMP8]] uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vuzpq_u8 return vuzpq_u8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.uint16x8x2_t @test_vuzpq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x2_t [[TMP1]]2 uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vuzpq_u16 return vuzpq_u16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: uzp2 {{v[0-9]+}}.8h, 
{{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.uint32x4x2_t @test_vuzpq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP1]]1, [2 x <4 x i32>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x2_t [[TMP1]]2 uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vuzpq_u32 return vuzpq_u32(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.float32x4x2_t @test_vuzpq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16 +// 
CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x float>] [[TMP1]]1, [2 x <4 x float>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP1]]2 float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vuzpq_f32 return vuzpq_f32(a, b); - // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.poly8x16x2_t @test_vuzpq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x2_t [[TMP8]] poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vuzpq_p8 return vuzpq_p8(a, b); - // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vuzpq_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* 
[[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP1]]2 poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vuzpq_p16 return vuzpq_p16(a, b); - // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.int8x8x2_t @test_vzip_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x2_t [[TMP8]] int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vzip_s8 return vzip_s8(a, b); - // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.int16x4x2_t @test_vzip_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: 
[[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP1]]2 int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vzip_s16 return vzip_s16(a, b); - // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.int32x2x2_t @test_vzip_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP1]]1, [2 x <2 x i32>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP1]]2 int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vzip_s32 return vzip_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define %struct.uint8x8x2_t @test_vzip_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: 
[[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x2_t [[TMP8]] uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vzip_u8 return vzip_u8(a, b); - // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.uint16x4x2_t @test_vzip_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP1]]2 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vzip_u16 return vzip_u16(a, b); - // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vzip_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, 
align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP1]]1, [2 x <2 x i32>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP1]]2 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vzip_u32 return vzip_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define %struct.float32x2x2_t @test_vzip_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = 
extractvalue %struct.float32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x float>] [[TMP1]]1, [2 x <2 x float>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP1]]2 float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vzip_f32 return vzip_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vzip_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x2_t [[TMP8]] poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vzip_p8 return vzip_p8(a, b); - // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.poly16x4x2_t @test_vzip_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* 
[[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP1]]2 poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vzip_p16 return vzip_p16(a, b); - // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.int8x16x2_t @test_vzipq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP8]] int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vzipq_s8 return vzipq_s8(a, b); - // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.int16x8x2_t @test_vzipq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> 
[[VZIP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP1]]2 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vzipq_s16 return vzipq_s16(a, b); - // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.int32x4x2_t @test_vzipq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP1]]1, [2 x <4 x i32>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP1]]2 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vzipq_s32 return vzipq_s32(a, b); - // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.uint8x16x2_t @test_vzipq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> 
+// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x2_t [[TMP8]] uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vzipq_u8 return vzipq_u8(a, b); - // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.uint16x8x2_t @test_vzipq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x2_t [[TMP1]]2 uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vzipq_u16 return vzipq_u16(a, b); - // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.uint32x4x2_t @test_vzipq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP1]]1, [2 x <4 x i32>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x2_t [[TMP1]]2 uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vzipq_u32 return vzipq_u32(a, b); - // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.float32x4x2_t @test_vzipq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x float>] [[TMP1]]1, [2 x <4 x 
float>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP1]]2 float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vzipq_f32 return vzipq_f32(a, b); - // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.poly8x16x2_t @test_vzipq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x2_t [[TMP8]] poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vzipq_p8 return vzipq_p8(a, b); - // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vzipq_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x8x2_t, 
%struct.poly16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP1]]2 poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vzipq_p16 return vzipq_p16(a, b); - // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.int8x8x2_t @test_vtrn_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x2_t [[TMP8]] int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vtrn_s8 return vtrn_s8(a, b); - // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.int16x4x2_t @test_vtrn_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast 
%struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP1]]2 int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) { - // CHECK-LABEL: test_vtrn_s16 return vtrn_s16(a, b); - // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.int32x2x2_t @test_vtrn_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP1]]1, [2 x <2 x i32>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP1]]2 int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) { - // CHECK-LABEL: test_vtrn_s32 return vtrn_s32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define %struct.uint8x8x2_t @test_vtrn_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x 
i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x2_t [[TMP8]] uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtrn_u8 return vtrn_u8(a, b); - // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.uint16x4x2_t @test_vtrn_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP12]] uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) { - // CHECK-LABEL: test_vtrn_u16 return vtrn_u16(a, b); - // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vtrn_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2
x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP12]] uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) { - // CHECK-LABEL: test_vtrn_u32 return vtrn_u32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define %struct.float32x2x2_t @test_vtrn_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x float>] [[TMP11]], [2 x <2 x float>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret
%struct.float32x2x2_t [[TMP12]] float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vtrn_f32 return vtrn_f32(a, b); - // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}} - // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}} } +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vtrn_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x2_t [[TMP8]] poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vtrn_p8 return vtrn_p8(a, b); - // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define %struct.poly16x4x2_t @test_vtrn_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]],
i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP12]] poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) { - // CHECK-LABEL: test_vtrn_p16 return vtrn_p16(a, b); - // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h } +// CHECK-LABEL: define %struct.int8x16x2_t @test_vtrnq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP8]] int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vtrnq_s8 return vtrnq_s8(a, b); - // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.int16x8x2_t @test_vtrnq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8*
[[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP12]] int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) { - // CHECK-LABEL: test_vtrnq_s16 return vtrnq_s16(a, b); - // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.int32x4x2_t @test_vtrnq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP12]] int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) { - // CHECK-LABEL: test_vtrnq_s32 return vtrnq_s32(a, b); - // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.uint8x16x2_t @test_vtrnq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +//
CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x2_t [[TMP8]] uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vtrnq_u8 return vtrnq_u8(a, b); - // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.uint16x8x2_t @test_vtrnq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x2_t [[TMP12]] uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) { - // CHECK-LABEL: test_vtrnq_u16 return vtrnq_u16(a, b); - // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } +// CHECK-LABEL: define %struct.uint32x4x2_t @test_vtrnq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast
<4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x2_t [[TMP12]] uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) { - // CHECK-LABEL: test_vtrnq_u32 return vtrnq_u32(a, b); - // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.float32x4x2_t @test_vtrnq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x float>] [[TMP11]], [2 x <4 x float>]* [[TMP10]], align 16 +// CHECK: [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP12]] float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) { -
// CHECK-LABEL: test_vtrnq_f32 return vtrnq_f32(a, b); - // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s } +// CHECK-LABEL: define %struct.poly8x16x2_t @test_vtrnq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x2_t [[TMP8]] poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) { - // CHECK-LABEL: test_vtrnq_p8 return vtrnq_p8(a, b); - // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vtrnq_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0 +// 
CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP12]] poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) { - // CHECK-LABEL: test_vtrnq_p16 return vtrnq_p16(a, b); - // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h } Index: test/CodeGen/aarch64-neon-scalar-copy.c =================================================================== --- test/CodeGen/aarch64-neon-scalar-copy.c +++ test/CodeGen/aarch64-neon-scalar-copy.c @@ -1,173 +1,228 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s #include -// CHECK-LABEL: test_vdups_lane_f32 +// CHECK-LABEL: define float @test_vdups_lane_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VDUPS_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +// CHECK: ret float [[VDUPS_LANE]] float32_t test_vdups_lane_f32(float32x2_t a) { return vdups_lane_f32(a, 1); -// CHECK: ret -// CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vdupd_lane_f64 +// CHECK-LABEL: define double @test_vdupd_lane_f64(<1 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VDUPD_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK: ret double [[VDUPD_LANE]] float64_t test_vdupd_lane_f64(float64x1_t a) { return vdupd_lane_f64(a, 0); -// CHECK: ret -// CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[0] } -// CHECK-LABEL: test_vdups_laneq_f32 +// CHECK-LABEL: define float @test_vdups_laneq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +// CHECK: ret float [[VGETQ_LANE]] float32_t test_vdups_laneq_f32(float32x4_t a) { return vdups_laneq_f32(a, 3); -// CHECK: ret -// CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vdupd_laneq_f64 +// CHECK-LABEL: define double @test_vdupd_laneq_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +// CHECK: ret double [[VGETQ_LANE]] float64_t test_vdupd_laneq_f64(float64x2_t a) { return vdupd_laneq_f64(a, 1); -// CHECK: ret -// CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1] } -// CHECK-LABEL: test_vdupb_lane_s8 +// CHECK-LABEL: define i8 @test_vdupb_lane_s8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] int8_t test_vdupb_lane_s8(int8x8_t a) { return vdupb_lane_s8(a, 7); -// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7] } -// CHECK-LABEL: test_vduph_lane_s16 +// CHECK-LABEL: define i16 @test_vduph_lane_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] int16_t
test_vduph_lane_s16(int16x4_t a) { return vduph_lane_s16(a, 3); -// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vdups_lane_s32 +// CHECK-LABEL: define i32 @test_vdups_lane_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: ret i32 [[VGET_LANE]] int32_t test_vdups_lane_s32(int32x2_t a) { return vdups_lane_s32(a, 1); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vdupd_lane_s64 +// CHECK-LABEL: define i64 @test_vdupd_lane_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: ret i64 [[VGET_LANE]] int64_t test_vdupd_lane_s64(int64x1_t a) { return vdupd_lane_s64(a, 0); -// CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vdupb_lane_u8 +// CHECK-LABEL: define i8 @test_vdupb_lane_u8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] uint8_t test_vdupb_lane_u8(uint8x8_t a) { return vdupb_lane_u8(a, 7); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7] } -// CHECK-LABEL: test_vduph_lane_u16 +// CHECK-LABEL: define i16 @test_vduph_lane_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] uint16_t test_vduph_lane_u16(uint16x4_t a) { return vduph_lane_u16(a, 3); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vdups_lane_u32 +// CHECK-LABEL: define i32 @test_vdups_lane_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: ret i32 [[VGET_LANE]] uint32_t test_vdups_lane_u32(uint32x2_t a) { return vdups_lane_u32(a, 1); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vdupd_lane_u64 +// CHECK-LABEL: define i64 @test_vdupd_lane_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: ret i64 [[VGET_LANE]] uint64_t test_vdupd_lane_u64(uint64x1_t a) { return vdupd_lane_u64(a, 0); -// CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} } -// CHECK-LABEL: test_vdupb_laneq_s8 +// CHECK-LABEL: define i8 @test_vdupb_laneq_s8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] int8_t test_vdupb_laneq_s8(int8x16_t a) { return vdupb_laneq_s8(a, 15); -// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15] } -// CHECK-LABEL: test_vduph_laneq_s16 +// CHECK-LABEL: define i16 @test_vduph_laneq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] int16_t test_vduph_laneq_s16(int16x8_t a) { return vduph_laneq_s16(a, 7); -// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7] } -// 
CHECK-LABEL: test_vdups_laneq_s32 +// CHECK-LABEL: define i32 @test_vdups_laneq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: ret i32 [[VGETQ_LANE]] int32_t test_vdups_laneq_s32(int32x4_t a) { return vdups_laneq_s32(a, 3); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vdupd_laneq_s64 +// CHECK-LABEL: define i64 @test_vdupd_laneq_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: ret i64 [[VGETQ_LANE]] int64_t test_vdupd_laneq_s64(int64x2_t a) { return vdupd_laneq_s64(a, 1); -// CHECK: {{mov|umov}} {{x[0-9]+}}, {{v[0-9]+}}.d[1] } -// CHECK-LABEL: test_vdupb_laneq_u8 +// CHECK-LABEL: define i8 @test_vdupb_laneq_u8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] uint8_t test_vdupb_laneq_u8(uint8x16_t a) { return vdupb_laneq_u8(a, 15); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15] } -// CHECK-LABEL: test_vduph_laneq_u16 +// CHECK-LABEL: define i16 @test_vduph_laneq_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] uint16_t test_vduph_laneq_u16(uint16x8_t a) { return vduph_laneq_u16(a, 7); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7] } -// CHECK-LABEL: test_vdups_laneq_u32 +// CHECK-LABEL: define i32 @test_vdups_laneq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: ret i32 [[VGETQ_LANE]] uint32_t test_vdups_laneq_u32(uint32x4_t a) { return vdups_laneq_u32(a, 3); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vdupd_laneq_u64 +// CHECK-LABEL: define i64 @test_vdupd_laneq_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: ret i64 [[VGETQ_LANE]] uint64_t test_vdupd_laneq_u64(uint64x2_t a) { return vdupd_laneq_u64(a, 1); -// CHECK: {{mov|umov}} {{x[0-9]+}}, {{v[0-9]+}}.d[1] } -// CHECK-LABEL: test_vdupb_lane_p8 +// CHECK-LABEL: define i8 @test_vdupb_lane_p8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] poly8_t test_vdupb_lane_p8(poly8x8_t a) { return vdupb_lane_p8(a, 7); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7] } -// CHECK-LABEL: test_vduph_lane_p16 +// CHECK-LABEL: define i16 @test_vduph_lane_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] poly16_t test_vduph_lane_p16(poly16x4_t a) { return vduph_lane_p16(a, 3); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vdupb_laneq_p8 +// CHECK-LABEL: define i8 
@test_vdupb_laneq_p8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] poly8_t test_vdupb_laneq_p8(poly8x16_t a) { return vdupb_laneq_p8(a, 15); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15] } -// CHECK-LABEL: test_vduph_laneq_p16 +// CHECK-LABEL: define i16 @test_vduph_laneq_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] poly16_t test_vduph_laneq_p16(poly16x8_t a) { return vduph_laneq_p16(a, 7); -// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7] } Index: test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c =================================================================== --- test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c +++ test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c @@ -1,259 +1,509 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s // Test new aarch64 intrinsics and types #include +// CHECK-LABEL: define float @test_vmuls_lane_f32(float %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +// CHECK: [[MUL:%.*]] = fmul float %a, [[VGET_LANE]] +// CHECK: ret float [[MUL]] float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) { - // CHECK-LABEL: test_vmuls_lane_f32 return vmuls_lane_f32(a, b, 1); - // CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define double @test_vmuld_lane_f64(double %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK: [[MUL:%.*]] = fmul double %a, [[VGET_LANE]] +// CHECK: ret double [[MUL]] float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) { - // CHECK-LABEL: test_vmuld_lane_f64 return vmuld_lane_f64(a, b, 0); - // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}} } +// CHECK-LABEL: define float @test_vmuls_laneq_f32(float %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +// CHECK: [[MUL:%.*]] = fmul float %a, [[VGETQ_LANE]] +// CHECK: ret float [[MUL]] float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) { - // CHECK-LABEL: test_vmuls_laneq_f32 return vmuls_laneq_f32(a, b, 3); - // CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define double @test_vmuld_laneq_f64(double %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +// CHECK: [[MUL:%.*]] = fmul double %a, [[VGETQ_LANE]] +// CHECK: ret double [[MUL]] float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) { - // CHECK-LABEL: test_vmuld_laneq_f64 return vmuld_laneq_f64(a, b, 1); - // CHECK: fmul {{d[0-9]+}}, 
{{d[0-9]+}}, {{v[0-9]+}}.d[1] } +// CHECK-LABEL: define <1 x double> @test_vmul_n_f64(<1 x double> %a, double %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[TMP1]] to double +// CHECK: [[TMP3:%.*]] = fmul double [[TMP2]], %b +// CHECK: [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double> +// CHECK: ret <1 x double> [[TMP4]] float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) { - // CHECK-LABEL: test_vmul_n_f64 return vmul_n_f64(a, b); - // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}} } +// CHECK-LABEL: define float @test_vmulxs_lane_f32(float %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGET_LANE]]) #2 +// CHECK: ret float [[VMULXS_F32_I]] float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) { -// CHECK-LABEL: test_vmulxs_lane_f32 return vmulxs_lane_f32(a, b, 1); -// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } +// CHECK-LABEL: define float @test_vmulxs_laneq_f32(float %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGETQ_LANE]]) #2 +// CHECK: ret float [[VMULXS_F32_I]] float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) { -// CHECK-LABEL: test_vmulxs_laneq_f32 return vmulxs_laneq_f32(a, b, 3); -// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] } +// CHECK-LABEL: define double @test_vmulxd_lane_f64(double %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGET_LANE]]) #2 +// CHECK: ret double [[VMULXD_F64_I]] float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) { -// CHECK-LABEL: test_vmulxd_lane_f64 return vmulxd_lane_f64(a, b, 0); -// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}} } +// CHECK-LABEL: define double @test_vmulxd_laneq_f64(double %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGETQ_LANE]]) #2 +// CHECK: ret double [[VMULXD_F64_I]] float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) { -// CHECK-LABEL: test_vmulxd_laneq_f64 return vmulxd_laneq_f64(a, b, 1); -// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] } -// CHECK-LABEL: test_vmulx_lane_f64 +// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK: 
[[TMP2:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK: [[VGET_LANE6:%.*]] = extractelement <1 x double> [[TMP3]], i32 0 +// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE6]]) #2 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0 +// CHECK: ret <1 x double> [[VSET_LANE]] float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) { return vmulx_lane_f64(a, b, 0); - // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}} } -// CHECK-LABEL: test_vmulx_laneq_f64_0 +// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0 +// CHECK: ret <1 x double> [[VSET_LANE]] float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) { return vmulx_laneq_f64(a, b, 0); - // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] } -// CHECK-LABEL: test_vmulx_laneq_f64_1 +// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_1(<1 x double> %a, <2 x double> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0 +// CHECK: ret <1 x double> [[VSET_LANE]] float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) { return vmulx_laneq_f64(a, b, 1); - // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] } -// CHECK-LABEL: test_vfmas_lane_f32 +// CHECK-LABEL: define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a) +// CHECK: ret float [[TMP2]] float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) { return vfmas_lane_f32(a, b, c, 1); - // CHECK: fmla
{{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vfmad_lane_f64 +// CHECK-LABEL: define double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %c to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a) +// CHECK: ret double [[TMP2]] float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) { return vfmad_lane_f64(a, b, c, 0); - // CHECK: {{fmla|fmadd}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}} } -// CHECK-LABEL: test_vfmad_laneq_f64 +// CHECK-LABEL: define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %c to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a) +// CHECK: ret double [[TMP2]] float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) { return vfmad_laneq_f64(a, b, c, 1); - // CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] } -// CHECK-LABEL: test_vfmss_lane_f32 +// CHECK-LABEL: define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 { +// CHECK: [[SUB:%.*]] = fsub <2 x float> , %c +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a) +// CHECK: ret float [[TMP2]] float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) { return vfmss_lane_f32(a, b, c, 1); - // CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vfma_lane_f64 +// CHECK-LABEL: define <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer +// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]]) +// CHECK: ret <1 x double> [[FMLA2]] float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) { return vfma_lane_f64(a, b, v, 0); - // CHECK: {{fmla|fmadd}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}} } -// CHECK-LABEL: test_vfms_lane_f64 +// CHECK-LABEL: define <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 { +// CHECK: [[SUB:%.*]] = fsub <1 x double> , %v +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double>
[[TMP3]], <1 x i32> zeroinitializer +// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]]) +// CHECK: ret <1 x double> [[FMLA2]] float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) { return vfms_lane_f64(a, b, v, 0); - // CHECK: {{fmls|fmsub}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}} } -// CHECK-LABEL: test_vfma_laneq_f64 +// CHECK-LABEL: define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]]) +// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double> +// CHECK: ret <1 x double> [[TMP7]] float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) { return vfma_laneq_f64(a, b, v, 0); - // CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] } -// CHECK-LABEL: test_vfms_laneq_f64 +// CHECK-LABEL: define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 { +// CHECK: [[SUB:%.*]] = fsub <2 x double> , %v +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]]) +// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double> +// CHECK: ret <1 x double> [[TMP7]] float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) { return vfms_laneq_f64(a, b, v, 0); - // CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] } -// CHECK-LABEL: test_vqdmullh_lane_s16 +// CHECK-LABEL: define i32 @test_vqdmullh_lane_s16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0 +// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2 +// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0 +// CHECK: ret i32 [[TMP4]] int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) { return vqdmullh_lane_s16(a, b, 3); - // CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9].4h}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vqdmulls_lane_s32 +// CHECK-LABEL: define i64 @test_vqdmulls_lane_s32(i32 %a, <2 x i32>
%b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGET_LANE]]) #2 +// CHECK: ret i64 [[VQDMULLS_S32_I]] int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) { return vqdmulls_lane_s32(a, b, 1); - // CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vqdmullh_laneq_s16 +// CHECK-LABEL: define i32 @test_vqdmullh_laneq_s16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0 +// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2 +// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0 +// CHECK: ret i32 [[TMP4]] int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) { return vqdmullh_laneq_s16(a, b, 7); - // CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7] } -// CHECK-LABEL: test_vqdmulls_laneq_s32 +// CHECK-LABEL: define i64 @test_vqdmulls_laneq_s32(i32 %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGETQ_LANE]]) #2 +// CHECK: ret i64 [[VQDMULLS_S32_I]] int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) { return vqdmulls_laneq_s32(a, b, 3); - // CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vqdmulhh_lane_s16 +// CHECK-LABEL: define i16 @test_vqdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0 +// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2 +// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP4]] int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) { return vqdmulhh_lane_s16(a, b, 3); -// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vqdmulhs_lane_s32 +// CHECK-LABEL: define i32 @test_vqdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2 +// CHECK: ret i32 [[VQDMULHS_S32_I]] int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) { return vqdmulhs_lane_s32(a, b, 1); -// CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: 
test_vqdmulhh_laneq_s16 +// CHECK-LABEL: define i16 @test_vqdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0 +// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2 +// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP4]] int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) { return vqdmulhh_laneq_s16(a, b, 7); -// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7] } -// CHECK-LABEL: test_vqdmulhs_laneq_s32 +// CHECK-LABEL: define i32 @test_vqdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2 +// CHECK: ret i32 [[VQDMULHS_S32_I]] int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) { return vqdmulhs_laneq_s32(a, b, 3); -// CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vqrdmulhh_lane_s16 +// CHECK-LABEL: define i16 @test_vqrdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0 +// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2 +// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP4]] int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) { return vqrdmulhh_lane_s16(a, b, 3); -// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vqrdmulhs_lane_s32 +// CHECK-LABEL: define i32 @test_vqrdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2 +// CHECK: ret i32 [[VQRDMULHS_S32_I]] int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) { return vqrdmulhs_lane_s32(a, b, 1); -// CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vqrdmulhh_laneq_s16 +// CHECK-LABEL: define i16 @test_vqrdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0 +// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2 +// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0 +// CHECK: ret i16 [[TMP4]] int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) { return vqrdmulhh_laneq_s16(a, b, 7); -// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7] } -// CHECK-LABEL: test_vqrdmulhs_laneq_s32 +// CHECK-LABEL: define i32 @test_vqrdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2 +// CHECK: ret i32 [[VQRDMULHS_S32_I]] int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) { return vqrdmulhs_laneq_s32(a, b, 3); -// CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vqdmlalh_lane_s16 +// CHECK-LABEL: define i32 @test_vqdmlalh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0 +// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) +// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0 +// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]]) +// CHECK: ret i32 [[VQDMLXL1]] int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) { return vqdmlalh_lane_s16(a, b, c, 3); -// CHECK: sqdmlal {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vqdmlals_lane_s32 +// CHECK-LABEL: define i64 @test_vqdmlals_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]]) +// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]]) +// CHECK: ret i64 [[VQDMLXL1]] int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) { return vqdmlals_lane_s32(a, b, c, 1); -// CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vqdmlalh_laneq_s16 +// CHECK-LABEL: define i32 @test_vqdmlalh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0 +// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) +// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0 +// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]]) +// CHECK: ret i32 [[VQDMLXL1]] int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) { return vqdmlalh_laneq_s16(a, 
b, c, 7); -// CHECK: sqdmlal {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7] } -// CHECK-LABEL: test_vqdmlals_laneq_s32 +// CHECK-LABEL: define i64 @test_vqdmlals_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]]) +// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]]) +// CHECK: ret i64 [[VQDMLXL1]] int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) { return vqdmlals_laneq_s32(a, b, c, 3); -// CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vqdmlslh_lane_s16 +// CHECK-LABEL: define i32 @test_vqdmlslh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0 +// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) +// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0 +// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]]) +// CHECK: ret i32 [[VQDMLXL1]] int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) { return vqdmlslh_lane_s16(a, b, c, 3); -// CHECK: sqdmlsl {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3] } -// CHECK-LABEL: test_vqdmlsls_lane_s32 +// CHECK-LABEL: define i64 @test_vqdmlsls_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]]) +// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]]) +// CHECK: ret i64 [[VQDMLXL1]] int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) { return vqdmlsls_lane_s32(a, b, c, 1); -// CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] } -// CHECK-LABEL: test_vqdmlslh_laneq_s16 +// CHECK-LABEL: define i32 @test_vqdmlslh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0 +// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) +// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0 +// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]]) +// CHECK: ret i32 [[VQDMLXL1]] int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) { return vqdmlslh_laneq_s16(a, b, c, 7); -// CHECK: sqdmlsl {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7] } -// CHECK-LABEL: test_vqdmlsls_laneq_s32 +// CHECK-LABEL: define i64 
@test_vqdmlsls_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]]) +// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]]) +// CHECK: ret i64 [[VQDMLXL1]] int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) { return vqdmlsls_laneq_s32(a, b, c, 3); -// CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] } -// CHECK-LABEL: test_vmulx_lane_f64_0: +// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64_0() #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double> +// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0 +// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP1]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK: [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP5]], i32 0 +// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]]) #2 +// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0 +// CHECK: ret <1 x double> [[VSET_LANE]] float64x1_t test_vmulx_lane_f64_0() { float64x1_t arg1; float64x1_t arg2; @@ -262,15 +512,24 @@ arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2)); arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3)); result = vmulx_lane_f64(arg1, arg2, 0); -// CHECK: adrp x[[ADDRLO:[0-9]+]] -// CHECK: ldr d0, [x[[ADDRLO]], -// CHECK: adrp x[[ADDRLO:[0-9]+]] -// CHECK: ldr d1, [x[[ADDRLO]], -// CHECK: fmulx d0, d1, d0 return result; } -// CHECK-LABEL: test_vmulx_laneq_f64_2: +// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_2() #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double> +// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double> +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1> +// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0 +// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[SHUFFLE_I]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 +// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2 +// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0 +// CHECK: ret <1 x double> [[VSET_LANE]] float64x1_t test_vmulx_laneq_f64_2() { float64x1_t arg1; float64x1_t arg2; @@ -281,10 +540,5 @@ arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3)); arg3 = vcombine_f64(arg1, arg2); result = vmulx_laneq_f64(arg1, arg3, 1); -// CHECK: adrp x[[ADDRLO:[0-9]+]] -// CHECK: ldr d0, [x[[ADDRLO]], -// CHECK: adrp x[[ADDRLO:[0-9]+]] -// CHECK: ldr d1, [x[[ADDRLO]], -// CHECK: fmulx d0, d1, d0 return result; }
Index: test/CodeGen/aarch64-neon-shifts.c =================================================================== --- test/CodeGen/aarch64-neon-shifts.c +++ test/CodeGen/aarch64-neon-shifts.c @@ -1,6 +1,5 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -emit-llvm -O1 -o - %s | FileCheck %s +// RUN: -ffp-contract=fast -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s #include <arm_neon.h> @@ -25,19 +24,20 @@ uint8x8_t test_shift_vsra(uint8x8_t a, uint8x8_t b) { // CHECK-LABEL: test_shift_vsra // CHECK: %[[SHR:.*]] = lshr <8 x i8> %b, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5> - // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a + // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]] return vsra_n_u8(a, b, 5); } int8x8_t test_shift_vsra_smax(int8x8_t a, int8x8_t b) { // CHECK-LABEL: test_shift_vsra_smax // CHECK: %[[SHR:.*]] = ashr <8 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> - // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a + // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]] return vsra_n_s8(a, b, 8); } uint8x8_t test_shift_vsra_umax(uint8x8_t a, uint8x8_t b) { // CHECK-LABEL: test_shift_vsra_umax - // CHECK: ret <8 x i8> %a + // CHECK: [[RES:%.*]] = add <8 x i8> %a, zeroinitializer + // CHECK: ret <8 x i8> [[RES]] return vsra_n_u8(a, b, 8); } Index: test/CodeGen/aarch64-neon-tbl.c =================================================================== --- test/CodeGen/aarch64-neon-tbl.c +++ test/CodeGen/aarch64-neon-tbl.c @@ -1,463 +1,1500 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s // Test new aarch64 intrinsics and types #include <arm_neon.h> +// CHECK-LABEL: define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL11_I]] int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) { - // CHECK-LABEL: test_vtbl1_s8 return vtbl1_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl1_s8(<16 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL1_I]] int8x8_t test_vqtbl1_s8(int8x16_t a, int8x8_t b) { - // CHECK-LABEL: test_vqtbl1_s8 return vqtbl1_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl2_s8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[A]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0 
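A reader aid for the scalar saturating multiply-accumulate checks above: vqdmlalh_lane_s16 and its laneq/sqsub siblings lower to an @llvm.aarch64.neon.sqdmull of the two scalar operands followed by a saturating @llvm.aarch64.neon.sqadd or sqsub, which is exactly the instruction pair the new IR-level CHECK lines pin down. A minimal standalone sketch of the semantics these checks encode, with hypothetical operand values chosen so that no saturation occurs:

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  // vqdmlalh_lane_s16(a, b, v, 3) computes sat32(a + sat32(2 * b * v[3])).
  int16x4_t v = {1, 2, 3, 4};                      // lane 3 holds 4
  int32_t acc = vqdmlalh_lane_s16(100, 5, v, 3);   // 100 + 2*5*4 = 140
  // vqdmlsls_lane_s32 is the subtracting (sqsub) counterpart on 64 bits.
  int32x2_t w = {7, 2};                            // lane 1 holds 2
  int64_t acc2 = vqdmlsls_lane_s32(1000, 3, w, 1); // 1000 - 2*3*2 = 988
  printf("%d %lld\n", acc, (long long)acc2);       // prints "140 988"
  return 0;
}
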
+// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL13_I]] int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) { - // CHECK-LABEL: test_vtbl2_s8 return vtbl2_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl2_s8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL2_I]] int8x8_t test_vqtbl2_s8(int8x16x2_t a, int8x8_t b) { - // CHECK-LABEL: test_vqtbl2_s8 return vqtbl2_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl3_s8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[A]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE]]1, 
align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL26_I]] int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) { - // CHECK-LABEL: test_vtbl3_s8 return vtbl3_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl3_s8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> 
[[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL3_I]] int8x8_t test_vqtbl3_s8(int8x16x3_t a, int8x8_t b) { - // CHECK-LABEL: test_vqtbl3_s8 return vqtbl3_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl4_s8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx6.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL28_I]] int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) { - // CHECK-LABEL: test_vtbl4_s8 return vtbl4_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl4_s8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: 
[[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx6.i, align 16 +// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL4_I]] int8x8_t test_vqtbl4_s8(int8x16x4_t a, int8x8_t b) { - // CHECK-LABEL: test_vqtbl4_s8 return vqtbl4_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl1q_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL1_I]] int8x16_t test_vqtbl1q_s8(int8x16_t a, int8x16_t b) { - // CHECK-LABEL: test_vqtbl1q_s8 return vqtbl1q_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl2q_s8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 
0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL2_I]] int8x16_t test_vqtbl2q_s8(int8x16x2_t a, int8x16_t b) { - // CHECK-LABEL: test_vqtbl2q_s8 return vqtbl2q_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl3q_s8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL3_I]] int8x16_t test_vqtbl3q_s8(int8x16x3_t a, int8x16_t b) { - // CHECK-LABEL: test_vqtbl3q_s8 return vqtbl3q_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl4q_s8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <16 
x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx6.i, align 16 +// CHECK: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL4_I]] int8x16_t test_vqtbl4q_s8(int8x16x4_t a, int8x16_t b) { - // CHECK-LABEL: test_vqtbl4q_s8 return vqtbl4q_s8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #2 +// CHECK: [[TMP0:%.*]] = icmp uge <8 x i8> %c, +// CHECK: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a +// CHECK: [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], +// CHECK: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]] +// CHECK: ret <8 x i8> [[VTBX_I]] int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) { - // CHECK-LABEL: test_vtbx1_s8 return vtbx1_s8(a, b, c); - // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0 - // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b - // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* 
[[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX13_I]] int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) { - // CHECK-LABEL: test_vtbx2_s8 return vtbx2_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #2 +// CHECK: [[TMP4:%.*]] = icmp uge <8 x i8> %c, +// CHECK: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8> +// CHECK: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a +// CHECK: [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], +// CHECK: [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x 
i8> [[TMP6]], [[TMP8]] +// CHECK: ret <8 x i8> [[VTBX_I]] int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) { - // CHECK-LABEL: test_vtbx3_s8 return vtbx3_s8(a, b, c); - // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0 - // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b - // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8 +// CHECK: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX28_I]] int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) { - // CHECK-LABEL: test_vtbx4_s8 return vtbx4_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx1_s8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX1_I]] int8x8_t test_vqtbx1_s8(int8x8_t a, int8x16_t b, int8x8_t c) { - // CHECK-LABEL: test_vqtbx1_s8 return vqtbx1_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, 
{{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx2_s8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX2_I]] int8x8_t test_vqtbx2_s8(int8x8_t a, int8x16x2_t b, int8x8_t c) { - // CHECK-LABEL: test_vqtbx2_s8 return vqtbx2_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx3_s8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 
x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX3_I]] int8x8_t test_vqtbx3_s8(int8x8_t a, int8x16x3_t b, int8x8_t c) { - // CHECK-LABEL: test_vqtbx3_s8 return vqtbx3_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx4_s8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16 +// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX4_I]] int8x8_t test_vqtbx4_s8(int8x8_t a, int8x16x4_t b, int8x8_t c) { - // CHECK-LABEL: test_vqtbx4_s8 return vqtbx4_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx1q_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX1_I]] int8x16_t test_vqtbx1q_s8(int8x16_t a, int8x16_t b, int8x16_t c) { - // CHECK-LABEL: test_vqtbx1q_s8 return vqtbx1q_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, 
{{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx2q_s8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX2_I]] int8x16_t test_vqtbx2q_s8(int8x16_t a, int8x16x2_t b, int8x16_t c) { - // CHECK-LABEL: test_vqtbx2q_s8 return vqtbx2q_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx3q_s8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = 
getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX3_I]] int8x16_t test_vqtbx3q_s8(int8x16_t a, int8x16x3_t b, int8x16_t c) { - // CHECK-LABEL: test_vqtbx3q_s8 return vqtbx3q_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx4q_s8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16 +// CHECK: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX4_I]] int8x16_t test_vqtbx4q_s8(int8x16_t a, int8x16x4_t b, int8x16_t c) { - // CHECK-LABEL: test_vqtbx4q_s8 return vqtbx4q_s8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL11_I]] uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) { - // 
CHECK-LABEL: test_vtbl1_u8 return vtbl1_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl1_u8(<16 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL1_I]] uint8x8_t test_vqtbl1_u8(uint8x16_t a, uint8x8_t b) { - // CHECK-LABEL: test_vqtbl1_u8 return vqtbl1_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl2_u8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[A]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL13_I]] uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtbl2_u8 return vtbl2_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[A:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK: store [2 x <16 x i8>] %a.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %b) #2
+// CHECK: ret <8 x i8> [[VTBL2_I]]
uint8x8_t test_vqtbl2_u8(uint8x16x2_t a, uint8x8_t b) {
- // CHECK-LABEL: test_vqtbl2_u8
  return vqtbl2_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_u8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %a.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #2
+// CHECK: ret <8 x i8> [[VTBL26_I]]
uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
- // CHECK-LABEL: test_vtbl3_u8
  return vtbl3_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <8 x i8> @test_vqtbl3_u8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[A:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+// CHECK: store [3 x <16 x i8>] %a.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+//
CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL3_I]] uint8x8_t test_vqtbl3_u8(uint8x16x3_t a, uint8x8_t b) { - // CHECK-LABEL: test_vqtbl3_u8 return vqtbl3_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl4_u8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = 
getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx6.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL28_I]] uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtbl4_u8 return vtbl4_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl4_u8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx6.i, align 16 +// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL4_I]] uint8x8_t test_vqtbl4_u8(uint8x16x4_t a, uint8x8_t b) { - // CHECK-LABEL: test_vqtbl4_u8 return vqtbl4_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl1q_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL1_I]] uint8x16_t test_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { - // CHECK-LABEL: 
test_vqtbl1q_u8 return vqtbl1q_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl2q_u8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL2_I]] uint8x16_t test_vqtbl2q_u8(uint8x16x2_t a, uint8x16_t b) { - // CHECK-LABEL: test_vqtbl2q_u8 return vqtbl2q_u8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl3q_u8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* 
[[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %b) #2
+// CHECK: ret <16 x i8> [[VTBL3_I]]
uint8x16_t test_vqtbl3q_u8(uint8x16x3_t a, uint8x16_t b) {
- // CHECK-LABEL: test_vqtbl3q_u8
  return vqtbl3q_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
+// CHECK-LABEL: define <16 x i8> @test_vqtbl4q_u8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[A:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] %a.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %b) #2
+// CHECK: ret <16 x i8> [[VTBL4_I]]
uint8x16_t test_vqtbl4q_u8(uint8x16x4_t a, uint8x16_t b) {
- // CHECK-LABEL: test_vqtbl4q_u8
  return vqtbl4q_u8(a, b);
- // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
}
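+// Editor's note: the vqtbl* forms take full 16-byte tables, so the IR above
+// passes the loaded registers straight to the tbl2/tbl3/tbl4 intrinsics with
+// no shufflevector padding. A hedged usage sketch (variable names are
+// illustrative, not part of the test):
+//   uint8x16x4_t lut = { { t0, t1, t2, t3 } };  /* 64-byte lookup table */
+//   uint8x16_t r = vqtbl4q_u8(lut, idx);        /* lanes with idx >= 64 become 0 */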
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #2
+// CHECK: [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+// CHECK: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a
+// CHECK: [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]]
+// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]]
+// CHECK: ret <8 x i8> [[VTBX_I]]
uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
- // CHECK-LABEL: test_vtbx1_u8
  return vtbx1_u8(a, b, c);
- // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
- // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
- // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
- // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
- // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
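+// Editor's note: vtbx1 cannot use a plain TBX on a zero-padded 16-byte table
+// (indices 8-15 would read the zero half instead of leaving %a intact), so
+// the IR selects per lane: (c >= 8) ? a : tbl(c), via the sext/and/xor/or
+// sequence checked above. A scalar sketch of one lane (illustrative only;
+// the helper is hypothetical):
+//   static inline uint8_t vtbx1_lane(uint8_t a, const uint8_t tab[8], uint8_t idx) {
+//     return idx < 8 ? tab[idx] : a;  /* out-of-range lanes keep the accumulator */
+//   }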
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #2
+// CHECK: ret <8 x i8> [[VTBX13_I]]
uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
- // CHECK-LABEL: test_vtbx2_u8
  return vtbx2_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #2
+// CHECK: [[TMP4:%.*]] = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+// CHECK: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
+// CHECK: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a
+// CHECK: [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK: [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]]
+// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]]
+// CHECK: ret <8 x i8> [[VTBX_I]]
uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
- // CHECK-LABEL: test_vtbx3_u8
  return vtbx3_u8(a, b, c);
- // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
- // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
- // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
- // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
- // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
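+// Editor's note: vtbx3 follows the same compare-and-select pattern with a
+// 24-byte table (three d registers): the third half is zero-padded for tbl2,
+// and the icmp threshold is 24 so that lanes with c >= 24 keep %a rather
+// than reading the zero padding.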
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #2
+// CHECK: ret <8 x i8> [[VTBX28_I]]
uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
- // CHECK-LABEL: test_vtbx4_u8
  return vtbx4_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <8 x i8> @test_vqtbx1_u8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #0 {
+// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #2
+// CHECK: ret <8 x i8> [[VTBX1_I]]
uint8x8_t test_vqtbx1_u8(uint8x8_t a, uint8x16_t b, uint8x8_t c) {
- // CHECK-LABEL: test_vqtbx1_u8
  return vqtbx1_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <8 x i8> @test_vqtbx2_u8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <16 x i8>] %b.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %c) #2
+// CHECK: ret <8 x i8> [[VTBX2_I]]
uint8x8_t test_vqtbx2_u8(uint8x8_t a, uint8x16x2_t b, uint8x8_t c) {
- // CHECK-LABEL: test_vqtbx2_u8
  return vqtbx2_u8(a, b, c);
- // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
}
+// CHECK-LABEL: define <8 x i8> @test_vqtbx3_u8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK: [[__P1_I:%.*]] = alloca
%struct.uint8x16x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX3_I]] uint8x8_t test_vqtbx3_u8(uint8x8_t a, uint8x16x3_t b, uint8x8_t c) { - // CHECK-LABEL: test_vqtbx3_u8 return vqtbx3_u8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx4_u8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 
x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16 +// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX4_I]] uint8x8_t test_vqtbx4_u8(uint8x8_t a, uint8x16x4_t b, uint8x8_t c) { - // CHECK-LABEL: test_vqtbx4_u8 return vqtbx4_u8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx1q_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX1_I]] uint8x16_t test_vqtbx1q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx1q_u8 return vqtbx1q_u8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx2q_u8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX2_I]] uint8x16_t test_vqtbx2q_u8(uint8x16_t a, uint8x16x2_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx2q_u8 return vqtbx2q_u8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx3q_u8(<16 x i8> %a, [3 x 
<16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX3_I]] uint8x16_t test_vqtbx3q_u8(uint8x16_t a, uint8x16x3_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx3q_u8 return vqtbx3q_u8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx4q_u8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x 
i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16 +// CHECK: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX4_I]] uint8x16_t test_vqtbx4q_u8(uint8x16_t a, uint8x16x4_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx4q_u8 return vqtbx4q_u8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL11_I]] poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtbl1_p8 return vtbl1_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl1_p8(<16 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL1_I]] poly8x8_t test_vqtbl1_p8(poly8x16_t a, uint8x8_t b) { - // CHECK-LABEL: test_vqtbl1_p8 return vqtbl1_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl2_p8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[A]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* 
[[A]]rrayidx2.i, align 8 +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL13_I]] poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtbl2_p8 return vtbl2_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl2_p8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL2_I]] poly8x8_t test_vqtbl2_p8(poly8x16x2_t a, uint8x8_t b) { - // CHECK-LABEL: test_vqtbl2_p8 return vqtbl2_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl3_p8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[A]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// 
CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL26_I]] poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtbl3_p8 return vtbl3_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl3_p8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL3_I]] poly8x8_t test_vqtbl3_p8(poly8x16x3_t a, uint8x8_t b) { - // CHECK-LABEL: test_vqtbl3_p8 return vqtbl3_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbl4_p8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, 
%struct.poly8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx6.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL28_I]] poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) { - // CHECK-LABEL: test_vtbl4_p8 return vtbl4_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbl4_p8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds 
%struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx6.i, align 16 +// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %b) #2 +// CHECK: ret <8 x i8> [[VTBL4_I]] poly8x8_t test_vqtbl4_p8(poly8x16x4_t a, uint8x8_t b) { - // CHECK-LABEL: test_vqtbl4_p8 return vqtbl4_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl1q_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL1_I]] poly8x16_t test_vqtbl1q_p8(poly8x16_t a, uint8x16_t b) { - // CHECK-LABEL: test_vqtbl1q_p8 return vqtbl1q_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl2q_p8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL2_I]] poly8x16_t test_vqtbl2q_p8(poly8x16x2_t a, uint8x16_t b) { - // CHECK-LABEL: test_vqtbl2q_p8 return vqtbl2q_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ 
?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl3q_p8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx2.i, align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx4.i, align 16 +// CHECK: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL3_I]] poly8x16_t test_vqtbl3q_p8(poly8x16x3_t a, uint8x16_t b) { - // CHECK-LABEL: test_vqtbl3q_p8 return vqtbl3q_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbl4q_p8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[A:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE]]1, align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE]].i, align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[A]]rrayidx.i, align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// 
CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16 +// CHECK: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %b) #2 +// CHECK: ret <16 x i8> [[VTBL4_I]] poly8x16_t test_vqtbl4q_p8(poly8x16x4_t a, uint8x16_t b) { - // CHECK-LABEL: test_vqtbl4q_p8 return vqtbl4q_p8(a, b); - // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #2 +// CHECK: [[TMP0:%.*]] = icmp uge <8 x i8> %c, +// CHECK: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a +// CHECK: [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], +// CHECK: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]] +// CHECK: ret <8 x i8> [[VTBX_I]] poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) { - // CHECK-LABEL: test_vtbx1_p8 return vtbx1_p8(a, b, c); - // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0 - // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b - // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds 
%struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX13_I]] poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) { - // CHECK-LABEL: test_vtbx2_p8 return vtbx2_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #2 +// CHECK: [[TMP4:%.*]] = icmp uge <8 x i8> %c, +// CHECK: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8> +// CHECK: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a +// CHECK: [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], +// CHECK: [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]] +// CHECK: ret <8 x i8> [[VTBX_I]] poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) { - // CHECK-LABEL: test_vtbx3_p8 return vtbx3_p8(a, b, c); - // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0 - // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] - // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b - // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - // CHECK: bsl 
{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8 +// CHECK: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX28_I]] poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) { - // CHECK-LABEL: test_vtbx4_p8 return vtbx4_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx1_p8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX1_I]] poly8x8_t test_vqtbx1_p8(poly8x8_t a, uint8x16_t b, uint8x8_t c) { - // CHECK-LABEL: test_vqtbx1_p8 return vqtbx1_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx2_p8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] %b.coerce, [2 x 
<16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX2_I]] poly8x8_t test_vqtbx2_p8(poly8x8_t a, poly8x16x2_t b, uint8x8_t c) { - // CHECK-LABEL: test_vqtbx2_p8 return vqtbx2_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx3_p8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX3_I]] poly8x8_t test_vqtbx3_p8(poly8x8_t a, poly8x16x3_t b, uint8x8_t c) 
{ - // CHECK-LABEL: test_vqtbx3_p8 return vqtbx3_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <8 x i8> @test_vqtbx4_p8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16 +// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %c) #2 +// CHECK: ret <8 x i8> [[VTBX4_I]] poly8x8_t test_vqtbx4_p8(poly8x8_t a, poly8x16x4_t b, uint8x8_t c) { - // CHECK-LABEL: test_vqtbx4_p8 return vqtbx4_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx1q_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX1_I]] poly8x16_t test_vqtbx1q_p8(poly8x16_t a, uint8x16_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx1q_p8 return vqtbx1q_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx2q_p8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, 
i32 0 +// CHECK: store [2 x <16 x i8>] %b.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX2_I]] poly8x16_t test_vqtbx2q_p8(poly8x16_t a, poly8x16x2_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx2q_p8 return vqtbx2q_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx3q_p8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %c) #2 +// CHECK: ret <16 x 
i8> [[VTBX3_I]] poly8x16_t test_vqtbx3q_p8(poly8x16_t a, poly8x16x3_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx3q_p8 return vqtbx3q_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } +// CHECK-LABEL: define <16 x i8> @test_vqtbx4q_p8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16 +// CHECK: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %c) #2 +// CHECK: ret <16 x i8> [[VTBX4_I]] poly8x16_t test_vqtbx4q_p8(poly8x16_t a, poly8x16x4_t b, uint8x16_t c) { - // CHECK-LABEL: test_vqtbx4q_p8 return vqtbx4q_p8(a, b, c); - // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b } Index: test/CodeGen/aarch64-neon-vcombine.c =================================================================== --- test/CodeGen/aarch64-neon-vcombine.c +++ test/CodeGen/aarch64-neon-vcombine.c @@ -1,90 +1,103 @@ -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -fallow-half-arguments-and-returns -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s // Test new aarch64 intrinsics and types #include +// CHECK-LABEL: define <16 x i8> @test_vcombine_s8(<8 x i8> %low, <8 x i8> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> 
%low, <8 x i8> %high, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vcombine_s8(int8x8_t low, int8x8_t high) { - // CHECK-LABEL: test_vcombine_s8: return vcombine_s8(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <8 x i16> @test_vcombine_s16(<4 x i16> %low, <4 x i16> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vcombine_s16(int16x4_t low, int16x4_t high) { - // CHECK-LABEL: test_vcombine_s16: return vcombine_s16(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(<2 x i32> %low, <2 x i32> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %low, <2 x i32> %high, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vcombine_s32(int32x2_t low, int32x2_t high) { - // CHECK-LABEL: test_vcombine_s32: return vcombine_s32(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <2 x i64> @test_vcombine_s64(<1 x i64> %low, <1 x i64> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vcombine_s64(int64x1_t low, int64x1_t high) { - // CHECK-LABEL: test_vcombine_s64: return vcombine_s64(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(<8 x i8> %low, <8 x i8> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %low, <8 x i8> %high, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vcombine_u8(uint8x8_t low, uint8x8_t high) { - // CHECK-LABEL: test_vcombine_u8: return vcombine_u8(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(<4 x i16> %low, <4 x i16> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vcombine_u16(uint16x4_t low, uint16x4_t high) { - // CHECK-LABEL: test_vcombine_u16: return vcombine_u16(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <4 x i32> @test_vcombine_u32(<2 x i32> %low, <2 x i32> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %low, <2 x i32> %high, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vcombine_u32(uint32x2_t low, uint32x2_t high) { - // CHECK-LABEL: test_vcombine_u32: return vcombine_u32(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <2 x i64> @test_vcombine_u64(<1 x i64> %low, <1 x i64> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vcombine_u64(uint64x1_t low, uint64x1_t high) { - // CHECK-LABEL: test_vcombine_u64: return vcombine_u64(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <2 x i64> @test_vcombine_p64(<1 x i64> %low, <1 x i64> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vcombine_p64(poly64x1_t low, poly64x1_t high) { - // CHECK-LABEL: test_vcombine_p64: return vcombine_p64(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <8 x half> @test_vcombine_f16(<4 x half> %low, <4 x half> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %low, <4 x half> %high, <8 x i32> +// CHECK: ret <8 x half> [[SHUFFLE_I]] float16x8_t 
test_vcombine_f16(float16x4_t low, float16x4_t high) { - // CHECK-LABEL: test_vcombine_f16: return vcombine_f16(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <4 x float> @test_vcombine_f32(<2 x float> %low, <2 x float> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %low, <2 x float> %high, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vcombine_f32(float32x2_t low, float32x2_t high) { - // CHECK-LABEL: test_vcombine_f32: return vcombine_f32(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <16 x i8> @test_vcombine_p8(<8 x i8> %low, <8 x i8> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %low, <8 x i8> %high, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vcombine_p8(poly8x8_t low, poly8x8_t high) { - // CHECK-LABEL: test_vcombine_p8: return vcombine_p8(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <8 x i16> @test_vcombine_p16(<4 x i16> %low, <4 x i16> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vcombine_p16(poly16x4_t low, poly16x4_t high) { - // CHECK-LABEL: test_vcombine_p16: return vcombine_p16(low, high); - // CHECK: ins v0.d[1], v1.d[0] } +// CHECK-LABEL: define <2 x double> @test_vcombine_f64(<1 x double> %low, <1 x double> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> %low, <1 x double> %high, <2 x i32> +// CHECK: ret <2 x double> [[SHUFFLE_I]] float64x2_t test_vcombine_f64(float64x1_t low, float64x1_t high) { - // CHECK-LABEL: test_vcombine_f64: return vcombine_f64(low, high); - // CHECK: ins v0.d[1], v1.d[0] } Index: test/CodeGen/aarch64-neon-vget-hilo.c =================================================================== --- test/CodeGen/aarch64-neon-vget-hilo.c +++ test/CodeGen/aarch64-neon-vget-hilo.c @@ -1,176 +1,203 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix CHECK-COMMON --check-prefix CHECK-ARM64 - +// RUN: -fallow-half-arguments-and-returns -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg | FileCheck %s // Test new aarch64 intrinsics and types #include +// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vget_high_s8(int8x16_t a) { - // CHECK-COMMON-LABEL: test_vget_high_s8: return vget_high_s8(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vget_high_s16(int16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_high_s16: return vget_high_s16(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vget_high_s32(int32x4_t a) { - // CHECK-COMMON-LABEL: test_vget_high_s32: return vget_high_s32(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x 
i32> +// CHECK: ret <1 x i64> [[SHUFFLE_I]] int64x1_t test_vget_high_s64(int64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_high_s64: return vget_high_s64(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vget_high_u8(uint8x16_t a) { - // CHECK-COMMON-LABEL: test_vget_high_u8: return vget_high_u8(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vget_high_u16(uint16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_high_u16: return vget_high_u16(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vget_high_u32(uint32x4_t a) { - // CHECK-COMMON-LABEL: test_vget_high_u32: return vget_high_u32(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> +// CHECK: ret <1 x i64> [[SHUFFLE_I]] uint64x1_t test_vget_high_u64(uint64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_high_u64: return vget_high_u64(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <1 x i64> @test_vget_high_p64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> +// CHECK: ret <1 x i64> [[SHUFFLE_I]] poly64x1_t test_vget_high_p64(poly64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_high_p64: return vget_high_p64(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <4 x half> @test_vget_high_f16(<8 x half> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> +// CHECK: ret <4 x half> [[SHUFFLE_I]] float16x4_t test_vget_high_f16(float16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_high_f16: return vget_high_f16(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <2 x float> @test_vget_high_f32(<4 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vget_high_f32(float32x4_t a) { - // CHECK-COMMON-LABEL: test_vget_high_f32: return vget_high_f32(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vget_high_p8(poly8x16_t a) { - // CHECK-COMMON-LABEL: test_vget_high_p8: return vget_high_p8(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vget_high_p16(poly16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_high_p16 return vget_high_p16(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <1 x double> @test_vget_high_f64(<2 x double> %a) #0 { +// 
CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32> +// CHECK: ret <1 x double> [[SHUFFLE_I]] float64x1_t test_vget_high_f64(float64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_high_f64 return vget_high_f64(a); - // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 } +// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vget_low_s8(int8x16_t a) { - // CHECK-COMMON-LABEL: test_vget_low_s8: return vget_low_s8(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vget_low_s16(int16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_low_s16: return vget_low_s16(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vget_low_s32(int32x4_t a) { - // CHECK-COMMON-LABEL: test_vget_low_s32: return vget_low_s32(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE_I]] int64x1_t test_vget_low_s64(int64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_low_s64: return vget_low_s64(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vget_low_u8(uint8x16_t a) { - // CHECK-COMMON-LABEL: test_vget_low_u8: return vget_low_u8(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vget_low_u16(uint16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_low_u16: return vget_low_u16(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vget_low_u32(uint32x4_t a) { - // CHECK-COMMON-LABEL: test_vget_low_u32: return vget_low_u32(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE_I]] uint64x1_t test_vget_low_u64(uint64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_low_u64: return vget_low_u64(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <1 x i64> @test_vget_low_p64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE_I]] poly64x1_t test_vget_low_p64(poly64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_low_p64: return vget_low_p64(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <4 x half> @test_vget_low_f16(<8 x half> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> +// CHECK: ret <4 x half> [[SHUFFLE_I]] 
float16x4_t test_vget_low_f16(float16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_low_f16: return vget_low_f16(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <2 x float> @test_vget_low_f32(<4 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vget_low_f32(float32x4_t a) { - // CHECK-COMMON-LABEL: test_vget_low_f32: return vget_low_f32(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vget_low_p8(poly8x16_t a) { - // CHECK-COMMON-LABEL: test_vget_low_p8: return vget_low_p8(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vget_low_p16(poly16x8_t a) { - // CHECK-COMMON-LABEL: test_vget_low_p16: return vget_low_p16(a); - // CHECK-COMMON-NEXT: ret } +// CHECK-LABEL: define <1 x double> @test_vget_low_f64(<2 x double> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x double> [[SHUFFLE_I]] float64x1_t test_vget_low_f64(float64x2_t a) { - // CHECK-COMMON-LABEL: test_vget_low_f64: return vget_low_f64(a); - // CHECK-COMMON-NEXT: ret } Index: test/CodeGen/aarch64-neon-vget.c =================================================================== --- test/CodeGen/aarch64-neon-vget.c +++ test/CodeGen/aarch64-neon-vget.c @@ -1,348 +1,458 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-apple-darwin -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: -fallow-half-arguments-and-returns -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg | FileCheck %s #include +// CHECK-LABEL: define i8 @test_vget_lane_u8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] uint8_t test_vget_lane_u8(uint8x8_t a) { - // CHECK-LABEL: test_vget_lane_u8: - // CHECK-NEXT: umov.b w0, v0[7] - // CHECK-NEXT: ret return vget_lane_u8(a, 7); } +// CHECK-LABEL: define i16 @test_vget_lane_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] uint16_t test_vget_lane_u16(uint16x4_t a) { - // CHECK-LABEL: test_vget_lane_u16: - // CHECK-NEXT: umov.h w0, v0[3] - // CHECK-NEXT: ret return vget_lane_u16(a, 3); } +// CHECK-LABEL: define i32 @test_vget_lane_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: ret i32 [[VGET_LANE]] uint32_t test_vget_lane_u32(uint32x2_t a) { - // CHECK-LABEL: test_vget_lane_u32: - // CHECK-NEXT: mov.s w0, v0[1] - // CHECK-NEXT: ret return vget_lane_u32(a, 1); } +// CHECK-LABEL: define i8 @test_vget_lane_s8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] int8_t test_vget_lane_s8(int8x8_t a) { - // CHECK-LABEL: test_vget_lane_s8: - // CHECK-NEXT: umov.b w0, v0[7] - // CHECK-NEXT: 
ret return vget_lane_s8(a, 7); } +// CHECK-LABEL: define i16 @test_vget_lane_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] int16_t test_vget_lane_s16(int16x4_t a) { - // CHECK-LABEL: test_vget_lane_s16: - // CHECK-NEXT: umov.h w0, v0[3] - // CHECK-NEXT: ret return vget_lane_s16(a, 3); } +// CHECK-LABEL: define i32 @test_vget_lane_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: ret i32 [[VGET_LANE]] int32_t test_vget_lane_s32(int32x2_t a) { - // CHECK-LABEL: test_vget_lane_s32: - // CHECK-NEXT: mov.s w0, v0[1] - // CHECK-NEXT: ret return vget_lane_s32(a, 1); } +// CHECK-LABEL: define i8 @test_vget_lane_p8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] poly8_t test_vget_lane_p8(poly8x8_t a) { - // CHECK-LABEL: test_vget_lane_p8: - // CHECK-NEXT: umov.b w0, v0[7] - // CHECK-NEXT: ret return vget_lane_p8(a, 7); } +// CHECK-LABEL: define i16 @test_vget_lane_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] poly16_t test_vget_lane_p16(poly16x4_t a) { - // CHECK-LABEL: test_vget_lane_p16: - // CHECK-NEXT: umov.h w0, v0[3] - // CHECK-NEXT: ret return vget_lane_p16(a, 3); } +// CHECK-LABEL: define float @test_vget_lane_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +// CHECK: ret float [[VGET_LANE]] float32_t test_vget_lane_f32(float32x2_t a) { - // CHECK-LABEL: test_vget_lane_f32: - // CHECK-NEXT: mov s0, v0[1] - // CHECK-NEXT: ret return vget_lane_f32(a, 1); } +// CHECK-LABEL: define float @test_vget_lane_f16(<4 x half> %a) #0 { +// CHECK: [[__REINT_242:%.*]] = alloca <4 x half>, align 8 +// CHECK: [[__REINT1_242:%.*]] = alloca i16, align 2 +// CHECK: store <4 x half> %a, <4 x half>* [[__REINT_242]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_242]] to <4 x i16>* +// CHECK: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1 +// CHECK: store i16 [[VGET_LANE]], i16* [[__REINT1_242]], align 2 +// CHECK: [[TMP4:%.*]] = bitcast i16* [[__REINT1_242]] to half* +// CHECK: [[TMP5:%.*]] = load half, half* [[TMP4]], align 2 +// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float +// CHECK: ret float [[CONV]] float32_t test_vget_lane_f16(float16x4_t a) { - // CHECK-LABEL: test_vget_lane_f16: - // CHECK-NEXT: umov.h w8, v0[1] - // CHECK-NEXT: fmov s0, w8 - // CHECK-NEXT: fcvt s0, h0 - // CHECK-NEXT: ret return vget_lane_f16(a, 1); } +// CHECK-LABEL: define i8 @test_vgetq_lane_u8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] uint8_t test_vgetq_lane_u8(uint8x16_t a) { - // CHECK-LABEL: 
test_vgetq_lane_u8: - // CHECK-NEXT: umov.b w0, v0[15] - // CHECK-NEXT: ret return vgetq_lane_u8(a, 15); } +// CHECK-LABEL: define i16 @test_vgetq_lane_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] uint16_t test_vgetq_lane_u16(uint16x8_t a) { - // CHECK-LABEL: test_vgetq_lane_u16: - // CHECK-NEXT: umov.h w0, v0[7] - // CHECK-NEXT: ret return vgetq_lane_u16(a, 7); } +// CHECK-LABEL: define i32 @test_vgetq_lane_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: ret i32 [[VGETQ_LANE]] uint32_t test_vgetq_lane_u32(uint32x4_t a) { - // CHECK-LABEL: test_vgetq_lane_u32: - // CHECK-NEXT: mov.s w0, v0[3] - // CHECK-NEXT: ret return vgetq_lane_u32(a, 3); } +// CHECK-LABEL: define i8 @test_vgetq_lane_s8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] int8_t test_vgetq_lane_s8(int8x16_t a) { - // CHECK-LABEL: test_vgetq_lane_s8: - // CHECK-NEXT: umov.b w0, v0[15] - // CHECK-NEXT: ret return vgetq_lane_s8(a, 15); } +// CHECK-LABEL: define i16 @test_vgetq_lane_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] int16_t test_vgetq_lane_s16(int16x8_t a) { - // CHECK-LABEL: test_vgetq_lane_s16: - // CHECK-NEXT: umov.h w0, v0[7] - // CHECK-NEXT: ret return vgetq_lane_s16(a, 7); } +// CHECK-LABEL: define i32 @test_vgetq_lane_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: ret i32 [[VGETQ_LANE]] int32_t test_vgetq_lane_s32(int32x4_t a) { - // CHECK-LABEL: test_vgetq_lane_s32: - // CHECK-NEXT: mov.s w0, v0[3] - // CHECK-NEXT: ret return vgetq_lane_s32(a, 3); } +// CHECK-LABEL: define i8 @test_vgetq_lane_p8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] poly8_t test_vgetq_lane_p8(poly8x16_t a) { - // CHECK-LABEL: test_vgetq_lane_p8: - // CHECK-NEXT: umov.b w0, v0[15] - // CHECK-NEXT: ret return vgetq_lane_p8(a, 15); } +// CHECK-LABEL: define i16 @test_vgetq_lane_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] poly16_t test_vgetq_lane_p16(poly16x8_t a) { - // CHECK-LABEL: test_vgetq_lane_p16: - // CHECK-NEXT: umov.h w0, v0[7] - // CHECK-NEXT: ret return vgetq_lane_p16(a, 7); } +// CHECK-LABEL: define float @test_vgetq_lane_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +// CHECK: ret float [[VGETQ_LANE]] float32_t test_vgetq_lane_f32(float32x4_t a) { - // CHECK-LABEL: test_vgetq_lane_f32: - // CHECK-NEXT: mov s0, v0[3] 
- // CHECK-NEXT: ret return vgetq_lane_f32(a, 3); } +// CHECK-LABEL: define float @test_vgetq_lane_f16(<8 x half> %a) #0 { +// CHECK: [[__REINT_244:%.*]] = alloca <8 x half>, align 16 +// CHECK: [[__REINT1_244:%.*]] = alloca i16, align 2 +// CHECK: store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>* +// CHECK: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +// CHECK: store i16 [[VGETQ_LANE]], i16* [[__REINT1_244]], align 2 +// CHECK: [[TMP4:%.*]] = bitcast i16* [[__REINT1_244]] to half* +// CHECK: [[TMP5:%.*]] = load half, half* [[TMP4]], align 2 +// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float +// CHECK: ret float [[CONV]] float32_t test_vgetq_lane_f16(float16x8_t a) { - // CHECK-LABEL: test_vgetq_lane_f16: - // CHECK-NEXT: umov.h w8, v0[3] - // CHECK-NEXT: fmov s0, w8 - // CHECK-NEXT: fcvt s0, h0 - // CHECK-NEXT: ret return vgetq_lane_f16(a, 3); } +// CHECK-LABEL: define i64 @test_vget_lane_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: ret i64 [[VGET_LANE]] int64_t test_vget_lane_s64(int64x1_t a) { - // CHECK-LABEL: test_vget_lane_s64: - // CHECK-NEXT: fmov x0, d0 - // CHECK-NEXT: ret return vget_lane_s64(a, 0); } +// CHECK-LABEL: define i64 @test_vget_lane_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: ret i64 [[VGET_LANE]] uint64_t test_vget_lane_u64(uint64x1_t a) { - // CHECK-LABEL: test_vget_lane_u64: - // CHECK-NEXT: fmov x0, d0 - // CHECK-NEXT: ret return vget_lane_u64(a, 0); } +// CHECK-LABEL: define i64 @test_vgetq_lane_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: ret i64 [[VGETQ_LANE]] int64_t test_vgetq_lane_s64(int64x2_t a) { - // CHECK-LABEL: test_vgetq_lane_s64: - // CHECK-NEXT: mov.d x0, v0[1] - // CHECK-NEXT: ret return vgetq_lane_s64(a, 1); } +// CHECK-LABEL: define i64 @test_vgetq_lane_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: ret i64 [[VGETQ_LANE]] uint64_t test_vgetq_lane_u64(uint64x2_t a) { - // CHECK-LABEL: test_vgetq_lane_u64: - // CHECK-NEXT: mov.d x0, v0[1] - // CHECK-NEXT: ret return vgetq_lane_u64(a, 1); } +// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(i8 %a, <8 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VSET_LANE]] uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) { - // CHECK-LABEL: test_vset_lane_u8: - // CHECK-NEXT: ins.b v0[7], w0 - // CHECK-NEXT: ret return vset_lane_u8(a, b, 7); } +// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = 
bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VSET_LANE]] uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) { - // CHECK-LABEL: test_vset_lane_u16: - // CHECK-NEXT: ins.h v0[3], w0 - // CHECK-NEXT: ret return vset_lane_u16(a, b, 3); } +// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(i32 %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1 +// CHECK: ret <2 x i32> [[VSET_LANE]] uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) { - // CHECK-LABEL: test_vset_lane_u32: - // CHECK-NEXT: ins.s v0[1], w0 - // CHECK-NEXT: ret return vset_lane_u32(a, b, 1); } +// CHECK-LABEL: define <8 x i8> @test_vset_lane_s8(i8 %a, <8 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VSET_LANE]] int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) { - // CHECK-LABEL: test_vset_lane_s8: - // CHECK-NEXT: ins.b v0[7], w0 - // CHECK-NEXT: ret return vset_lane_s8(a, b, 7); } +// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VSET_LANE]] int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) { - // CHECK-LABEL: test_vset_lane_s16: - // CHECK-NEXT: ins.h v0[3], w0 - // CHECK-NEXT: ret return vset_lane_s16(a, b, 3); } +// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32(i32 %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1 +// CHECK: ret <2 x i32> [[VSET_LANE]] int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) { - // CHECK-LABEL: test_vset_lane_s32: - // CHECK-NEXT: ins.s v0[1], w0 - // CHECK-NEXT: ret return vset_lane_s32(a, b, 1); } +// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8(i8 %a, <8 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VSET_LANE]] poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) { - // CHECK-LABEL: test_vset_lane_p8: - // CHECK-NEXT: ins.b v0[7], w0 - // CHECK-NEXT: ret return vset_lane_p8(a, b, 7); } +// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VSET_LANE]] poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) { - // CHECK-LABEL: test_vset_lane_p16: - // CHECK-NEXT: ins.h v0[3], w0 - // CHECK-NEXT: ret return vset_lane_p16(a, b, 3); } +// CHECK-LABEL: define <2 x float> @test_vset_lane_f32(float %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float %a, i32 1 +// CHECK: ret <2 x float> [[VSET_LANE]] float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) { - // CHECK-LABEL: test_vset_lane_f32: - // CHECK-NEXT: ins.s v1[1], v0[0] 
- // CHECK-NEXT: mov.16b v0, v1 - // CHECK-NEXT: ret return vset_lane_f32(a, b, 1); } +// CHECK-LABEL: define <4 x half> @test_vset_lane_f16(half* %a, <4 x half> %b) #0 { +// CHECK: [[__REINT_246:%.*]] = alloca half, align 2 +// CHECK: [[__REINT1_246:%.*]] = alloca <4 x half>, align 8 +// CHECK: [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8 +// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 +// CHECK: store half [[TMP0]], half* [[__REINT_246]], align 2 +// CHECK: store <4 x half> %b, <4 x half>* [[__REINT1_246]], align 8 +// CHECK: [[TMP1:%.*]] = bitcast half* [[__REINT_246]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK: [[TMP3:%.*]] = bitcast <4 x half>* [[__REINT1_246]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[TMP2]], i32 3 +// CHECK: store <4 x i16> [[VSET_LANE]], <4 x i16>* [[__REINT2_246]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16>* [[__REINT2_246]] to <4 x half>* +// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[TMP7]], align 8 +// CHECK: ret <4 x half> [[TMP8]] float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) { - // CHECK-LABEL: test_vset_lane_f16: - // CHECK-NEXT: ld1.h { v0 }[3], [x0] - // CHECK-NEXT: ret return vset_lane_f16(*a, b, 3); } +// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8(i8 %a, <16 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VSET_LANE]] uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) { - // CHECK-LABEL: test_vsetq_lane_u8: - // CHECK-NEXT: ins.b v0[15], w0 - // CHECK-NEXT: ret return vsetq_lane_u8(a, b, 15); } +// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VSET_LANE]] uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) { - // CHECK-LABEL: test_vsetq_lane_u16: - // CHECK-NEXT: ins.h v0[7], w0 - // CHECK-NEXT: ret return vsetq_lane_u16(a, b, 7); } +// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32(i32 %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3 +// CHECK: ret <4 x i32> [[VSET_LANE]] uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) { - // CHECK-LABEL: test_vsetq_lane_u32: - // CHECK-NEXT: ins.s v0[3], w0 - // CHECK-NEXT: ret return vsetq_lane_u32(a, b, 3); } +// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8(i8 %a, <16 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VSET_LANE]] int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) { - // CHECK-LABEL: test_vsetq_lane_s8: - // CHECK-NEXT: ins.b v0[15], w0 - // CHECK-NEXT: ret return vsetq_lane_s8(a, b, 15); } +// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7 +// CHECK: ret 
<8 x i16> [[VSET_LANE]] int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) { - // CHECK-LABEL: test_vsetq_lane_s16: - // CHECK-NEXT: ins.h v0[7], w0 - // CHECK-NEXT: ret return vsetq_lane_s16(a, b, 7); } +// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32(i32 %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3 +// CHECK: ret <4 x i32> [[VSET_LANE]] int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) { - // CHECK-LABEL: test_vsetq_lane_s32: - // CHECK-NEXT: ins.s v0[3], w0 - // CHECK-NEXT: ret return vsetq_lane_s32(a, b, 3); } +// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8(i8 %a, <16 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VSET_LANE]] poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) { - // CHECK-LABEL: test_vsetq_lane_p8: - // CHECK-NEXT: ins.b v0[15], w0 - // CHECK-NEXT: ret return vsetq_lane_p8(a, b, 15); } +// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VSET_LANE]] poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) { - // CHECK-LABEL: test_vsetq_lane_p16: - // CHECK-NEXT: ins.h v0[7], w0 - // CHECK-NEXT: ret return vsetq_lane_p16(a, b, 7); } +// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32(float %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 3 +// CHECK: ret <4 x float> [[VSET_LANE]] float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) { - // CHECK-LABEL: test_vsetq_lane_f32: - // CHECK-NEXT: ins.s v1[3], v0[0] - // CHECK-NEXT: mov.16b v0, v1 - // CHECK-NEXT: ret return vsetq_lane_f32(a, b, 3); } +// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16(half* %a, <8 x half> %b) #0 { +// CHECK: [[__REINT_248:%.*]] = alloca half, align 2 +// CHECK: [[__REINT1_248:%.*]] = alloca <8 x half>, align 16 +// CHECK: [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16 +// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 +// CHECK: store half [[TMP0]], half* [[__REINT_248]], align 2 +// CHECK: store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16 +// CHECK: [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK: [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 7 +// CHECK: store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>* +// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16 +// CHECK: ret <8 x half> [[TMP8]] float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) { - // CHECK-LABEL: test_vsetq_lane_f16: - // CHECK-NEXT: ld1.h { v0 }[7], [x0] - // CHECK-NEXT: ret return 
vsetq_lane_f16(*a, b, 7); } +// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64(i64 %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0 +// CHECK: ret <1 x i64> [[VSET_LANE]] int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) { - // CHECK-LABEL: test_vset_lane_s64: - // CHECK-NEXT: fmov d0, x0 - // CHECK-NEXT: ret return vset_lane_s64(a, b, 0); } +// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64(i64 %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0 +// CHECK: ret <1 x i64> [[VSET_LANE]] uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) { - // CHECK-LABEL: test_vset_lane_u64: - // CHECK-NEXT: fmov d0, x0 - // CHECK-NEXT: ret return vset_lane_u64(a, b, 0); } +// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64(i64 %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VSET_LANE]] int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) { - // CHECK-LABEL: test_vsetq_lane_s64: - // CHECK-NEXT: ins.d v0[1], x0 - // CHECK-NEXT: ret return vsetq_lane_s64(a, b, 1); } +// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64(i64 %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VSET_LANE]] uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) { - // CHECK-LABEL: test_vsetq_lane_u64: - // CHECK-NEXT: ins.d v0[1], x0 - // CHECK-NEXT: ret return vsetq_lane_u64(a, b, 1); } Index: test/CodeGen/aarch64-poly128.c =================================================================== --- test/CodeGen/aarch64-poly128.c +++ test/CodeGen/aarch64-poly128.c @@ -1,7 +1,7 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix=CHECK \ -// RUN: --check-prefix=CHECK-ARM64 +// RUN: -ffp-contract=fast -emit-llvm -o - %s | opt -S -mem2reg \ +// RUN: | FileCheck %s // Test new aarch64 intrinsics with poly128 // FIXME: Currently, poly128_t is equal to uint128, which will be split into @@ -12,192 +12,238 @@ #include <arm_neon.h> +// CHECK-LABEL: define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128* %ptr to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128* +// CHECK: store i128 %val, i128* [[TMP1]] +// CHECK: ret void void test_vstrq_p128(poly128_t * ptr, poly128_t val) { - // CHECK-LABEL: test_vstrq_p128 vstrq_p128(ptr, val); - // CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [x0] } +// CHECK-LABEL: define i128 @test_vldrq_p128(i128* %ptr) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128* %ptr to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128* +// CHECK: [[TMP2:%.*]] = load i128, i128* [[TMP1]] +// CHECK: ret i128 [[TMP2]] poly128_t test_vldrq_p128(poly128_t * ptr) { - // CHECK-LABEL: test_vldrq_p128 return vldrq_p128(ptr); - // CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [x0] } +// CHECK-LABEL: 
define void @test_ld_st_p128(i128* %ptr) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128* %ptr to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128* +// CHECK: [[TMP2:%.*]] = load i128, i128* [[TMP1]] +// CHECK: [[ADD_PTR:%.*]] = getelementptr inbounds i128, i128* %ptr, i64 1 +// CHECK: [[TMP3:%.*]] = bitcast i128* [[ADD_PTR]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i128* +// CHECK: store i128 [[TMP2]], i128* [[TMP4]] +// CHECK: ret void void test_ld_st_p128(poly128_t * ptr) { - // CHECK-LABEL: test_ld_st_p128 vstrq_p128(ptr+1, vldrq_p128(ptr)); - // CHECK-ARM64: ldp [[PLO:x[0-9]+]], [[PHI:x[0-9]+]], [{{x[0-9]+}}] - // CHECK-ARM64-NEXT: stp [[PLO]], [[PHI]], [{{x[0-9]+}}, #16] } +// CHECK-LABEL: define i128 @test_vmull_p64(i64 %a, i64 %b) #0 { +// CHECK: [[VMULL_P64_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b) #2 +// CHECK: [[VMULL_P641_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I]] to i128 +// CHECK: ret i128 [[VMULL_P641_I]] poly128_t test_vmull_p64(poly64_t a, poly64_t b) { - // CHECK-LABEL: test_vmull_p64 return vmull_p64(a, b); - // CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d } +// CHECK-LABEL: define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to i64 +// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <2 x i64> %b, <2 x i64> %b, <1 x i32> <i32 1> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I7_I]] to i64 +// CHECK: [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[TMP0]], i64 [[TMP1]]) #2 +// CHECK: [[VMULL_P641_I_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I_I]] to i128 +// CHECK: ret i128 [[VMULL_P641_I_I]] poly128_t test_vmull_high_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vmull_high_p64 return vmull_high_p64(a, b); - // CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } -// CHECK-LABEL: test_vreinterpretq_p128_s8 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_s8(int8x16_t a) { return vreinterpretq_p128_s8(a); } -// CHECK-LABEL: test_vreinterpretq_p128_s16 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_s16(int16x8_t a) { return vreinterpretq_p128_s16(a); } -// CHECK-LABEL: test_vreinterpretq_p128_s32 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_s32(int32x4_t a) { return vreinterpretq_p128_s32(a); } -// CHECK-LABEL: test_vreinterpretq_p128_s64 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_s64(int64x2_t a) { return vreinterpretq_p128_s64(a); } -// CHECK-LABEL: test_vreinterpretq_p128_u8 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_u8(uint8x16_t a) { return vreinterpretq_p128_u8(a); } -// CHECK-LABEL: 
test_vreinterpretq_p128_u16 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_u16(uint16x8_t a) { return vreinterpretq_p128_u16(a); } -// CHECK-LABEL: test_vreinterpretq_p128_u32 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_u32(uint32x4_t a) { return vreinterpretq_p128_u32(a); } -// CHECK-LABEL: test_vreinterpretq_p128_u64 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_u64(uint64x2_t a) { return vreinterpretq_p128_u64(a); } -// CHECK-LABEL: test_vreinterpretq_p128_f32 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_f32(float32x4_t a) { return vreinterpretq_p128_f32(a); } -// CHECK-LABEL: test_vreinterpretq_p128_f64 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_f64(<2 x double> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_f64(float64x2_t a) { return vreinterpretq_p128_f64(a); } -// CHECK-LABEL: test_vreinterpretq_p128_p8 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_p8(poly8x16_t a) { return vreinterpretq_p128_p8(a); } -// CHECK-LABEL: test_vreinterpretq_p128_p16 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_p16(poly16x8_t a) { return vreinterpretq_p128_p16(a); } -// CHECK-LABEL: test_vreinterpretq_p128_p64 -// CHECK: ret +// CHECK-LABEL: define i128 @test_vreinterpretq_p128_p64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to i128 +// CHECK: ret i128 [[TMP0]] poly128_t test_vreinterpretq_p128_p64(poly64x2_t a) { return vreinterpretq_p128_p64(a); } -// CHECK-LABEL: test_vreinterpretq_s8_p128 -// CHECK: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_p128(poly128_t a) { return vreinterpretq_s8_p128(a); } -// CHECK-LABEL: test_vreinterpretq_s16_p128 -// CHECK: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_p128(poly128_t a) { return vreinterpretq_s16_p128(a); } -// CHECK-LABEL: test_vreinterpretq_s32_p128 -// CHECK: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_p128(poly128_t a) { return vreinterpretq_s32_p128(a); } -// CHECK-LABEL: test_vreinterpretq_s64_p128 -// CHECK: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p128(i128 %a) #0 
{ +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_p128(poly128_t a) { return vreinterpretq_s64_p128(a); } -// CHECK-LABEL: test_vreinterpretq_u8_p128 -// CHECK: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_p128(poly128_t a) { return vreinterpretq_u8_p128(a); } -// CHECK-LABEL: test_vreinterpretq_u16_p128 -// CHECK: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_p128(poly128_t a) { return vreinterpretq_u16_p128(a); } -// CHECK-LABEL: test_vreinterpretq_u32_p128 -// CHECK: ret +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_p128(poly128_t a) { return vreinterpretq_u32_p128(a); } -// CHECK-LABEL: test_vreinterpretq_u64_p128 -// CHECK: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_p128(poly128_t a) { return vreinterpretq_u64_p128(a); } -// CHECK-LABEL: test_vreinterpretq_f32_p128 -// CHECK: ret +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_p128(poly128_t a) { return vreinterpretq_f32_p128(a); } -// CHECK-LABEL: test_vreinterpretq_f64_p128 -// CHECK: ret +// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <2 x double> +// CHECK: ret <2 x double> [[TMP0]] float64x2_t test_vreinterpretq_f64_p128(poly128_t a) { return vreinterpretq_f64_p128(a); } -// CHECK-LABEL: test_vreinterpretq_p8_p128 -// CHECK: ret +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_p128(poly128_t a) { return vreinterpretq_p8_p128(a); } -// CHECK-LABEL: test_vreinterpretq_p16_p128 -// CHECK: ret +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_p128(poly128_t a) { return vreinterpretq_p16_p128(a); } -// CHECK-LABEL: test_vreinterpretq_p64_p128 -// CHECK: ret +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_p128(i128 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i128 %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] poly64x2_t test_vreinterpretq_p64_p128(poly128_t a) { return vreinterpretq_p64_p128(a); } Index: test/CodeGen/aarch64-poly64.c =================================================================== --- test/CodeGen/aarch64-poly64.c +++ test/CodeGen/aarch64-poly64.c @@ -1,299 +1,634 @@ -// FIXME: This is a front-end test that depends on LLVM optimizations (-O3). -// It should be split into separate files for front/middle/back-end testing. 
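+// NOTE: with the new RUN line below, this file checks the unoptimized
+// front-end IR (cleaned up only by mem2reg) instead of -O3 assembly, so
+// the cmeq/cmtst/bsl assembly checks are replaced by icmp/and/xor IR
+// patterns. A usage sketch for the first test (operand values arbitrary,
+// not part of the test itself):
+//   poly64x1_t x = vcreate_p64(1ULL), y = vcreate_p64(1ULL);
+//   uint64x1_t m = vceq_p64(x, y); /* lane is all-ones when equal */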
- -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix=CHECK \ -// RUN: --check-prefix=CHECK-ARM64 +// RUN: -ffp-contract=fast -emit-llvm -o - %s | opt -S -mem2reg \ +// RUN: | FileCheck %s // Test new aarch64 intrinsics with poly64 #include <arm_neon.h> +// CHECK-LABEL: define <1 x i64> @test_vceq_p64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[SEXT_I]] uint64x1_t test_vceq_p64(poly64x1_t a, poly64x1_t b) { - // CHECK-LABEL: test_vceq_p64 return vceq_p64(a, b); - // CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x i64> @test_vceqq_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[SEXT_I]] uint64x2_t test_vceqq_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vceqq_p64 return vceqq_p64(a, b); - // CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <1 x i64> @test_vtst_p64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64> +// CHECK: ret <1 x i64> [[VTST_I]] uint64x1_t test_vtst_p64(poly64x1_t a, poly64x1_t b) { - // CHECK-LABEL: test_vtst_p64 return vtst_p64(a, b); - // CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define <2 x i64> @test_vtstq_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer +// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64> +// CHECK: ret <2 x i64> [[VTST_I]] uint64x2_t test_vtstq_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vtstq_p64 return vtstq_p64(a, b); - // CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <1 x i64> @test_vbsl_p64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <1 x i64> [[VBSL5_I]] poly64x1_t test_vbsl_p64(poly64x1_t a, poly64x1_t b, poly64x1_t c) { - // CHECK-LABEL: test_vbsl_p64 return vbsl_p64(a, b, c); -
// CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } +// CHECK-LABEL: define <2 x i64> @test_vbslq_p64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1> +// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i64> [[VBSL5_I]] poly64x2_t test_vbslq_p64(poly64x2_t a, poly64x2_t b, poly64x2_t c) { - // CHECK-LABEL: test_vbslq_p64 return vbslq_p64(a, b, c); - // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b } +// CHECK-LABEL: define i64 @test_vget_lane_p64(<1 x i64> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: ret i64 [[VGET_LANE]] poly64_t test_vget_lane_p64(poly64x1_t v) { - // CHECK-LABEL: test_vget_lane_p64 return vget_lane_p64(v, 0); - // CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} } +// CHECK-LABEL: define i64 @test_vgetq_lane_p64(<2 x i64> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: ret i64 [[VGETQ_LANE]] poly64_t test_vgetq_lane_p64(poly64x2_t v) { - // CHECK-LABEL: test_vgetq_lane_p64 return vgetq_lane_p64(v, 1); - // CHECK: {{mov|umov}} {{x[0-9]+}}, {{v[0-9]+}}.d[1] } +// CHECK-LABEL: define <1 x i64> @test_vset_lane_p64(i64 %a, <1 x i64> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0 +// CHECK: ret <1 x i64> [[VSET_LANE]] poly64x1_t test_vset_lane_p64(poly64_t a, poly64x1_t v) { - // CHECK-LABEL: test_vset_lane_p64 return vset_lane_p64(a, v, 0); - // CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} } +// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_p64(i64 %a, <2 x i64> %v) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VSET_LANE]] poly64x2_t test_vsetq_lane_p64(poly64_t a, poly64x2_t v) { - // CHECK-LABEL: test_vsetq_lane_p64 return vsetq_lane_p64(a, v, 1); - // CHECK: ins {{v[0-9]+}}.d[1], {{x[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vcopy_lane_p64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP3]], i64 [[VGET_LANE]], i32 0 +// CHECK: ret <1 x i64> [[VSET_LANE]] poly64x1_t test_vcopy_lane_p64(poly64x1_t a, poly64x1_t b) { - //
CHECK-LABEL: test_vcopy_lane_p64 return vcopy_lane_p64(a, 0, b, 0); - // CHECK-ARM64: mov v0.16b, v1.16b } +// CHECK-LABEL: define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGET_LANE]], i32 1 +// CHECK: ret <2 x i64> [[VSET_LANE]] poly64x2_t test_vcopyq_lane_p64(poly64x2_t a, poly64x1_t b) { - // CHECK-LABEL: test_vcopyq_lane_p64 return vcopyq_lane_p64(a, 1, b, 0); - // CHECK: zip1 v0.2d, v0.2d, v1.2d } +// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 1 +// CHECK: ret <2 x i64> [[VSET_LANE]] poly64x2_t test_vcopyq_laneq_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vcopyq_laneq_p64 return vcopyq_laneq_p64(a, 1, b, 1); } +// CHECK-LABEL: define <1 x i64> @test_vcreate_p64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] poly64x1_t test_vcreate_p64(uint64_t a) { - // CHECK-LABEL: test_vcreate_p64 return vcreate_p64(a); - // CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vdup_n_p64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK: ret <1 x i64> [[VECINIT_I]] poly64x1_t test_vdup_n_p64(poly64_t a) { - // CHECK-LABEL: test_vdup_n_p64 return vdup_n_p64(a); - // CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} } +// CHECK-LABEL: define <2 x i64> @test_vdupq_n_p64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VECINIT1_I]] poly64x2_t test_vdupq_n_p64(poly64_t a) { - // CHECK-LABEL: test_vdupq_n_p64 return vdupq_n_p64(a); - // CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vmov_n_p64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK: ret <1 x i64> [[VECINIT_I]] poly64x1_t test_vmov_n_p64(poly64_t a) { - // CHECK-LABEL: test_vmov_n_p64 return vmov_n_p64(a); - // CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} } +// CHECK-LABEL: define <2 x i64> @test_vmovq_n_p64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VECINIT1_I]] poly64x2_t test_vmovq_n_p64(poly64_t a) { - // CHECK-LABEL: test_vmovq_n_p64 return vmovq_n_p64(a); - // CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}} } +// CHECK-LABEL: define <1 x i64> @test_vdup_lane_p64(<1 x i64> %vec) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE]] poly64x1_t test_vdup_lane_p64(poly64x1_t vec) { - // CHECK-LABEL: 
test_vdup_lane_p64 return vdup_lane_p64(vec, 0); - // CHECK: ret } +// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_p64(<1 x i64> %vec) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[SHUFFLE]] poly64x2_t test_vdupq_lane_p64(poly64x1_t vec) { - // CHECK-LABEL: test_vdupq_lane_p64 return vdupq_lane_p64(vec, 0); - // CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <2 x i64> @test_vdupq_laneq_p64(<2 x i64> %vec) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i64> %vec, <2 x i64> %vec, <2 x i32> <i32 1, i32 1> +// CHECK: ret <2 x i64> [[SHUFFLE]] poly64x2_t test_vdupq_laneq_p64(poly64x2_t vec) { - // CHECK-LABEL: test_vdupq_laneq_p64 return vdupq_laneq_p64(vec, 1); - // CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] } +// CHECK-LABEL: define <2 x i64> @test_vcombine_p64(<1 x i64> %low, <1 x i64> %high) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vcombine_p64(poly64x1_t low, poly64x1_t high) { - // CHECK-LABEL: test_vcombine_p64 return vcombine_p64(low, high); - // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] } +// CHECK-LABEL: define <1 x i64> @test_vld1_p64(i64* %ptr) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]] +// CHECK: ret <1 x i64> [[TMP2]] poly64x1_t test_vld1_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld1_p64 return vld1_p64(ptr); - // CHECK-ARM64: ldr {{d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <2 x i64> @test_vld1q_p64(i64* %ptr) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]] +// CHECK: ret <2 x i64> [[TMP2]] poly64x2_t test_vld1q_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld1q_p64 return vld1q_p64(ptr); - // CHECK-ARM64: ldr {{q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1_p64(i64* %ptr, <1 x i64> %val) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %val to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1_p64(poly64_t * ptr, poly64x1_t val) { - // CHECK-LABEL: test_vst1_p64 return vst1_p64(ptr, val); - // CHECK-ARM64: str {{d[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst1q_p64(i64* %ptr, <2 x i64> %val) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %val to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1q_p64(poly64_t * ptr, poly64x2_t val) { - // CHECK-LABEL: test_vst1q_p64 return vst1q_p64(ptr, val); - // CHECK-ARM64: str {{q[0-9]+}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_p64(i64* %ptr) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8* +// CHECK: 
[[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x2_t [[TMP6]] poly64x1x2_t test_vld2_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld2_p64 return vld2_p64(ptr); - // CHECK: ld1 {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_p64(i64* %ptr) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>* +// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x2_t [[TMP6]] poly64x2x2_t test_vld2q_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld2q_p64 return vld2q_p64(ptr); - // CHECK: ld2 {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_p64(i64* %ptr) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x3_t [[TMP6]] poly64x1x3_t test_vld3_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld3_p64 return vld3_p64(ptr); - // CHECK: ld1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_p64(i64* %ptr) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16 
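+// NOTE: the vldN checks here all share one shape, shown in full above for
+// vld2/vld3 and continued below: the intrinsic returns an IR aggregate
+// such as { <2 x i64>, <2 x i64>, <2 x i64> }, which is stored to a local
+// and then memcpy'd into the returned struct. A minimal usage sketch (the
+// buffer `buf` and its contents are hypothetical):
+//   poly64_t buf[6] = {0};
+//   poly64x2x3_t v = vld3q_p64(buf); /* v.val[0..2] hold the deinterleaved vectors */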
+// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>* +// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x3_t [[TMP6]] poly64x2x3_t test_vld3q_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld3q_p64 return vld3q_p64(ptr); - // CHECK: ld3 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_p64(i64* %ptr) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly64x1x4_t [[TMP6]] poly64x1x4_t test_vld4_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld4_p64 return vld4_p64(ptr); - // CHECK: ld1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_p64(i64* %ptr) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>* +// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* +// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, 
%struct.poly64x2x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly64x2x4_t [[TMP6]] poly64x2x4_t test_vld4q_p64(poly64_t const * ptr) { - // CHECK-LABEL: test_vld4q_p64 return vld4q_p64(ptr); - // CHECK: ld4 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2_p64(i64* %ptr, [2 x <1 x i64>] %val.coerce) #0 { +// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[VAL]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[VAL]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[VAL]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]2, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]3, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_p64(poly64_t * ptr, poly64x1x2_t val) { - // CHECK-LABEL: test_vst2_p64 return vst2_p64(ptr, val); - // CHECK: st1 {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst2q_p64(i64* %ptr, [2 x <2 x i64>] %val.coerce) #0 { +// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[VAL]], i32 0, i32 0 +// CHECK: store [2 x <2 x i64>] [[VAL]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[VAL]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]1, i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]]2, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 
x i64>* [[ARRAYIDX]]3, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_p64(poly64_t * ptr, poly64x2x2_t val) { - // CHECK-LABEL: test_vst2q_p64 return vst2q_p64(ptr, val); - // CHECK: st2 {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst3_p64(i64* %ptr, [3 x <1 x i64>] %val.coerce) #0 { +// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[VAL]], i32 0, i32 0 +// CHECK: store [3 x <1 x i64>] [[VAL]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[VAL]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]]1, i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]]2, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]3, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]]4, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]5, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP1]]0, <1 x i64> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_p64(poly64_t * ptr, poly64x1x3_t val) { - // CHECK-LABEL: test_vst3_p64 return vst3_p64(ptr, val); - // CHECK: st1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst3q_p64(i64* %ptr, [3 x <2 x i64>] %val.coerce) #0 { +// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[VAL]], i32 0, i32 0 +// CHECK: store [3 x <2 x i64>] [[VAL]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[VAL]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* 
[[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]1, i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]2, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]3, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]]4, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]5, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP1]]0, <2 x i64> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_p64(poly64_t * ptr, poly64x2x3_t val) { - // CHECK-LABEL: test_vst3q_p64 return vst3q_p64(ptr, val); - // CHECK: st3 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4_p64(i64* %ptr, [4 x <1 x i64>] %val.coerce) #0 { +// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[VAL]], i32 0, i32 0 +// CHECK: store [4 x <1 x i64>] [[VAL]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[VAL]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]1, i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]2, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]3, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]4, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]5, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> 
[[TMP7]] to <8 x i8> +// CHECK: [[VAL6:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]]6, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]7, align 8 +// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP1]]0 to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP1]]1, <1 x i64> [[TMP1]]2, <1 x i64> [[TMP1]]3, <1 x i64> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4_p64(poly64_t * ptr, poly64x1x4_t val) { - // CHECK-LABEL: test_vst4_p64 return vst4_p64(ptr, val); - // CHECK: st1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define void @test_vst4q_p64(i64* %ptr, [4 x <2 x i64>] %val.coerce) #0 { +// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[VAL]], i32 0, i32 0 +// CHECK: store [4 x <2 x i64>] [[VAL]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[VAL]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8* +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]1, i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK: [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]2, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]3, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> +// CHECK: [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]4, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]5, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> +// CHECK: [[VAL6:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]]6, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]]7, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <2 x i64> +// CHECK: call void 
@llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP1]]1, <2 x i64> [[TMP1]]2, <2 x i64> [[TMP1]]3, <2 x i64> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_p64(poly64_t * ptr, poly64x2x4_t val) { - // CHECK-LABEL: test_vst4q_p64 return vst4q_p64(ptr, val); - // CHECK: st4 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}] } +// CHECK-LABEL: define <1 x i64> @test_vext_p64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[VEXT]] poly64x1_t test_vext_p64(poly64x1_t a, poly64x1_t b) { - // CHECK-LABEL: test_vext_p64 return vext_u64(a, b, 0); } +// CHECK-LABEL: define <2 x i64> @test_vextq_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2> +// CHECK: ret <2 x i64> [[VEXT]] poly64x2_t test_vextq_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vextq_p64 return vextq_p64(a, b, 1); - // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{#0x8|#8}} } +// CHECK-LABEL: define <2 x i64> @test_vzip1q_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vzip1q_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vzip1q_p64 return vzip1q_p64(a, b); - // CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vzip2q_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vzip2q_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vzip2q_p64 return vzip2q_u64(a, b); - // CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vuzp1q_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vuzp1q_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vuzp1q_p64 return vuzp1q_p64(a, b); - // CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vuzp2q_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vuzp2q_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vuzp2q_p64 return vuzp2q_u64(a, b); - // CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vtrn1q_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vtrn1q_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vtrn1q_p64 return vtrn1q_p64(a, b); - // CHECK-ARM64: zip1 {{v[0-9]+}}.2d, 
{{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <2 x i64> @test_vtrn2q_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] poly64x2_t test_vtrn2q_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vtrn2q_p64 return vtrn2q_u64(a, b); - // CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d } +// CHECK-LABEL: define <1 x i64> @test_vsri_n_p64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N]]1, i32 33) +// CHECK: ret <1 x i64> [[VSRI_N]]2 poly64x1_t test_vsri_n_p64(poly64x1_t a, poly64x1_t b) { - // CHECK-LABEL: test_vsri_n_p64 return vsri_n_p64(a, b, 33); - // CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #33 } +// CHECK-LABEL: define <2 x i64> @test_vsriq_n_p64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N]]1, i32 64) +// CHECK: ret <2 x i64> [[VSRI_N]]2 poly64x2_t test_vsriq_n_p64(poly64x2_t a, poly64x2_t b) { - // CHECK-LABEL: test_vsriq_n_p64 return vsriq_n_p64(a, b, 64); - // CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #64 } Index: test/CodeGen/arm-bitfield-alignment.c =================================================================== --- test/CodeGen/arm-bitfield-alignment.c +++ test/CodeGen/arm-bitfield-alignment.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple arm-none-eabi -ffreestanding -emit-llvm -o - -O3 %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -ffreestanding -emit-llvm -o - -O3 %s | FileCheck %s +// RUN: %clang_cc1 -triple arm-none-eabi -ffreestanding -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -ffreestanding -emit-llvm -o - %s | FileCheck %s extern struct T { int b0 : 8; Index: test/CodeGen/arm-crc32.c =================================================================== --- test/CodeGen/arm-crc32.c +++ test/CodeGen/arm-crc32.c @@ -1,6 +1,5 @@ -// REQUIRES: arm-registered-target // RUN: %clang_cc1 -triple armv8-none-linux-gnueabi \ -// RUN: -O3 -S -emit-llvm -o - %s | FileCheck %s +// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s int crc32b(int a, char b) { @@ -48,7 +47,7 @@ // CHECK: [[T0:%[0-9]+]] = trunc i64 %b to i32 // CHECK: [[T1:%[0-9]+]] = lshr i64 %b, 32 // CHECK: [[T2:%[0-9]+]] = trunc i64 [[T1]] to i32 -// CHECK: [[T3:%[0-9]+]] = tail call i32 @llvm.arm.crc32w(i32 %a, i32 [[T0]]) +// CHECK: [[T3:%[0-9]+]] = call i32 @llvm.arm.crc32w(i32 %a, i32 [[T0]]) // CHECK: call i32 @llvm.arm.crc32w(i32 [[T3]], i32 [[T2]]) } @@ -58,6 +57,6 @@ // CHECK: [[T0:%[0-9]+]] = trunc i64 %b to i32 // CHECK: [[T1:%[0-9]+]] = lshr i64 %b, 32 // CHECK: [[T2:%[0-9]+]] = trunc i64 [[T1]] to i32 -// CHECK: [[T3:%[0-9]+]] = tail call i32 @llvm.arm.crc32cw(i32 %a, i32 [[T0]]) +// CHECK: [[T3:%[0-9]+]] = call i32 @llvm.arm.crc32cw(i32 %a, i32 [[T0]]) // CHECK: call i32 @llvm.arm.crc32cw(i32 [[T3]], i32 [[T2]]) } Index: 
test/CodeGen/arm-neon-directed-rounding.c =================================================================== --- test/CodeGen/arm-neon-directed-rounding.c +++ test/CodeGen/arm-neon-directed-rounding.c @@ -1,75 +1,135 @@ -// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -O1 -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s #include +// CHECK-LABEL: define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDA_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> [[VRNDA_V_I]]) #2 +// CHECK: [[VRNDA_V2_I:%.*]] = bitcast <2 x float> [[VRNDA_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDA_V2_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP1]] float32x2_t test_vrnda_f32(float32x2_t a) { - // CHECK-LABEL: test_vrnda_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> %a) return vrnda_f32(a); } +// CHECK-LABEL: define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDAQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> [[VRNDAQ_V_I]]) #2 +// CHECK: [[VRNDAQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDAQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDAQ_V2_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vrndaq_f32(float32x4_t a) { - // CHECK-LABEL: test_vrndaq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> %a) return vrndaq_f32(a); } +// CHECK-LABEL: define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDM_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> [[VRNDM_V_I]]) #2 +// CHECK: [[VRNDM_V2_I:%.*]] = bitcast <2 x float> [[VRNDM_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDM_V2_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP1]] float32x2_t test_vrndm_f32(float32x2_t a) { - // CHECK-LABEL: test_vrndm_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> %a) return vrndm_f32(a); } +// CHECK-LABEL: define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> [[VRNDMQ_V_I]]) #2 +// CHECK: [[VRNDMQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDMQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDMQ_V2_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vrndmq_f32(float32x4_t a) { - // CHECK-LABEL: test_vrndmq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> %a) return vrndmq_f32(a); } +// CHECK-LABEL: define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> [[VRNDN_V_I]]) #2 +// CHECK: 
[[VRNDN_V2_I:%.*]] = bitcast <2 x float> [[VRNDN_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDN_V2_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP1]] float32x2_t test_vrndn_f32(float32x2_t a) { - // CHECK-LABEL: test_vrndn_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> %a) return vrndn_f32(a); } +// CHECK-LABEL: define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDNQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> [[VRNDNQ_V_I]]) #2 +// CHECK: [[VRNDNQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDNQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDNQ_V2_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vrndnq_f32(float32x4_t a) { - // CHECK-LABEL: test_vrndnq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> %a) return vrndnq_f32(a); } +// CHECK-LABEL: define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDP_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> [[VRNDP_V_I]]) #2 +// CHECK: [[VRNDP_V2_I:%.*]] = bitcast <2 x float> [[VRNDP_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDP_V2_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP1]] float32x2_t test_vrndp_f32(float32x2_t a) { - // CHECK-LABEL: test_vrndp_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> %a) return vrndp_f32(a); } +// CHECK-LABEL: define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDPQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> [[VRNDPQ_V_I]]) #2 +// CHECK: [[VRNDPQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDPQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDPQ_V2_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vrndpq_f32(float32x4_t a) { - // CHECK-LABEL: test_vrndpq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> %a) return vrndpq_f32(a); } +// CHECK-LABEL: define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRNDX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> [[VRNDX_V_I]]) #2 +// CHECK: [[VRNDX_V2_I:%.*]] = bitcast <2 x float> [[VRNDX_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDX_V2_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP1]] float32x2_t test_vrndx_f32(float32x2_t a) { - // CHECK-LABEL: test_vrndx_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> %a) return vrndx_f32(a); } +// CHECK-LABEL: define <4 x float> @test_vrndxq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> [[VRNDXQ_V_I]]) #2 +// CHECK: [[VRNDXQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDXQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDXQ_V2_I]] to <4 x float> +// 
CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vrndxq_f32(float32x4_t a) { - // CHECK-LABEL: test_vrndxq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> %a) return vrndxq_f32(a); } +// CHECK-LABEL: define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRND_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> [[VRND_V_I]]) #2 +// CHECK: [[VRND_V2_I:%.*]] = bitcast <2 x float> [[VRND_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VRND_V2_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP1]] float32x2_t test_vrnd_f32(float32x2_t a) { - // CHECK-LABEL: test_vrnd_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> %a) return vrnd_f32(a); } +// CHECK-LABEL: define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRNDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> [[VRNDQ_V_I]]) #2 +// CHECK: [[VRNDQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDQ_V2_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP1]] float32x4_t test_vrndq_f32(float32x4_t a) { - // CHECK-LABEL: test_vrndq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> %a) return vrndq_f32(a); } Index: test/CodeGen/arm-neon-fma.c =================================================================== --- test/CodeGen/arm-neon-fma.c +++ test/CodeGen/arm-neon-fma.c @@ -1,19 +1,34 @@ -// REQUIRES: arm-registered-target // RUN: %clang_cc1 -triple thumbv7-none-linux-gnueabihf \ // RUN: -target-abi aapcs \ // RUN: -target-cpu cortex-a8 \ // RUN: -mfloat-abi hard \ // RUN: -ffreestanding \ -// RUN: -O3 -S -emit-llvm -o - %s | FileCheck %s +// RUN: -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s #include +// CHECK-LABEL: define <2 x float> @test_fma_order(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %accum to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %lhs to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %rhs to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2 +// CHECK: ret <2 x float> [[TMP6]] float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs) { return vfma_f32(accum, lhs, rhs); -// CHECK: call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %rhs, <2 x float> %accum) } +// CHECK-LABEL: define <4 x float> @test_fmaq_order(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %accum to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %lhs to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %rhs to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> 
[[TMP5]], <4 x float> [[TMP3]]) #2 +// CHECK: ret <4 x float> [[TMP6]] float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs) { return vfmaq_f32(accum, lhs, rhs); -// CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %rhs, <4 x float> %accum) } Index: test/CodeGen/arm-neon-numeric-maxmin.c =================================================================== --- test/CodeGen/arm-neon-numeric-maxmin.c +++ test/CodeGen/arm-neon-numeric-maxmin.c @@ -1,27 +1,55 @@ -// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -O1 -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s #include +// CHECK-LABEL: define <2 x float> @test_vmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMAXNM_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMAXNM_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> [[VMAXNM_V_I]], <2 x float> [[VMAXNM_V1_I]]) #2 +// CHECK: [[VMAXNM_V3_I:%.*]] = bitcast <2 x float> [[VMAXNM_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMAXNM_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vmaxnm_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> %a, <2 x float> %b) return vmaxnm_f32(a, b); } +// CHECK-LABEL: define <4 x float> @test_vmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMAXNMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMAXNMQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> [[VMAXNMQ_V_I]], <4 x float> [[VMAXNMQ_V1_I]]) #2 +// CHECK: [[VMAXNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXNMQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXNMQ_V3_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vmaxnmq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> %a, <4 x float> %b) return vmaxnmq_f32(a, b); } +// CHECK-LABEL: define <2 x float> @test_vminnm_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMINNM_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMINNM_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> [[VMINNM_V_I]], <2 x float> [[VMINNM_V1_I]]) #2 +// CHECK: [[VMINNM_V3_I:%.*]] = bitcast <2 x float> [[VMINNM_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VMINNM_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) { - // CHECK-LABEL: test_vminnm_f32 - // CHECK: call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> %a, <2 x float> %b) return vminnm_f32(a, b); } +// CHECK-LABEL: define <4 x float> 
@test_vminnmq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMINNMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMINNMQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> [[VMINNMQ_V_I]], <4 x float> [[VMINNMQ_V1_I]]) #2 +// CHECK: [[VMINNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMINNMQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VMINNMQ_V3_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) { - // CHECK-LABEL: test_vminnmq_f32 - // CHECK: call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> %a, <4 x float> %b) return vminnmq_f32(a, b); } Index: test/CodeGen/arm-neon-shifts.c =================================================================== --- test/CodeGen/arm-neon-shifts.c +++ test/CodeGen/arm-neon-shifts.c @@ -2,7 +2,7 @@ // RUN: %clang_cc1 -triple thumbv7-apple-darwin \ // RUN: -target-cpu cortex-a8 \ // RUN: -ffreestanding \ -// RUN: -emit-llvm -w -O1 -o - %s | FileCheck %s +// RUN: -emit-llvm -w -o - %s | opt -S -mem2reg | FileCheck %s #include @@ -27,19 +27,20 @@ uint8x8_t test_shift_vsra(uint8x8_t a, uint8x8_t b) { // CHECK-LABEL: test_shift_vsra // CHECK: %[[SHR:.*]] = lshr <8 x i8> %b, - // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a + // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]] return vsra_n_u8(a, b, 5); } int8x8_t test_shift_vsra_smax(int8x8_t a, int8x8_t b) { // CHECK-LABEL: test_shift_vsra_smax // CHECK: %[[SHR:.*]] = ashr <8 x i8> %b, - // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a + // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]] return vsra_n_s8(a, b, 8); } uint8x8_t test_shift_vsra_umax(uint8x8_t a, uint8x8_t b) { // CHECK-LABEL: test_shift_vsra_umax - // CHECK: ret <8 x i8> %a + // CHECK: [[RES:%.*]] = add <8 x i8> %a, zeroinitializer + // CHECK: ret <8 x i8> [[RES]] return vsra_n_u8(a, b, 8); } Index: test/CodeGen/arm-neon-vcvtX.c =================================================================== --- test/CodeGen/arm-neon-vcvtX.c +++ test/CodeGen/arm-neon-vcvtX.c @@ -1,99 +1,147 @@ -// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -O1 -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s #include +// CHECK-LABEL: define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTA_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> [[VCVTA_S32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTA_S32_V1_I]] int32x2_t test_vcvta_s32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvta_s32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> %a) return vcvta_s32_f32(a); } +// CHECK-LABEL: define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTA_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> [[VCVTA_U32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTA_U32_V1_I]] uint32x2_t test_vcvta_u32_f32(float32x2_t a) { - // 
CHECK-LABEL: test_vcvta_u32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> %a) return vcvta_u32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTAQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> [[VCVTAQ_S32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTAQ_S32_V1_I]] int32x4_t test_vcvtaq_s32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtaq_s32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> %a) return vcvtaq_s32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTAQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> [[VCVTAQ_U32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTAQ_U32_V1_I]] uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtaq_u32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> %a) return vcvtaq_u32_f32(a); } +// CHECK-LABEL: define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTN_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> [[VCVTN_S32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTN_S32_V1_I]] int32x2_t test_vcvtn_s32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvtn_s32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> %a) return vcvtn_s32_f32(a); } +// CHECK-LABEL: define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTN_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> [[VCVTN_U32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTN_U32_V1_I]] uint32x2_t test_vcvtn_u32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvtn_u32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> %a) return vcvtn_u32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTNQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> [[VCVTNQ_S32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTNQ_S32_V1_I]] int32x4_t test_vcvtnq_s32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtnq_s32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> %a) return vcvtnq_s32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTNQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> [[VCVTNQ_U32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTNQ_U32_V1_I]] uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtnq_u32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x 
float> %a) return vcvtnq_u32_f32(a); } +// CHECK-LABEL: define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTP_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> [[VCVTP_S32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTP_S32_V1_I]] int32x2_t test_vcvtp_s32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvtp_s32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> %a) return vcvtp_s32_f32(a); } +// CHECK-LABEL: define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTP_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> [[VCVTP_U32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTP_U32_V1_I]] uint32x2_t test_vcvtp_u32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvtp_u32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> %a) return vcvtp_u32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTPQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> [[VCVTPQ_S32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTPQ_S32_V1_I]] int32x4_t test_vcvtpq_s32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtpq_s32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> %a) return vcvtpq_s32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTPQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> [[VCVTPQ_U32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTPQ_U32_V1_I]] uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtpq_u32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> %a) return vcvtpq_u32_f32(a); } +// CHECK-LABEL: define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTM_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> [[VCVTM_S32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTM_S32_V1_I]] int32x2_t test_vcvtm_s32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvtm_s32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> %a) return vcvtm_s32_f32(a); } +// CHECK-LABEL: define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VCVTM_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> [[VCVTM_U32_V_I]]) #2 +// CHECK: ret <2 x i32> [[VCVTM_U32_V1_I]] uint32x2_t test_vcvtm_u32_f32(float32x2_t a) { - // CHECK-LABEL: test_vcvtm_u32_f32 - // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> %a) return vcvtm_u32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 { +// 
CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTMQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> [[VCVTMQ_S32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTMQ_S32_V1_I]] int32x4_t test_vcvtmq_s32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtmq_s32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> %a) return vcvtmq_s32_f32(a); } +// CHECK-LABEL: define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVTMQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> [[VCVTMQ_U32_V_I]]) #2 +// CHECK: ret <4 x i32> [[VCVTMQ_U32_V1_I]] uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) { - // CHECK-LABEL: test_vcvtmq_u32_f32 - // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> %a) return vcvtmq_u32_f32(a); } Index: test/CodeGen/arm-neon-vget.c =================================================================== --- test/CodeGen/arm-neon-vget.c +++ test/CodeGen/arm-neon-vget.c @@ -1,124 +1,123 @@ -// REQUIRES: arm-registered-target // RUN: %clang_cc1 -triple thumbv7-apple-darwin \ // RUN: -target-abi apcs-gnu \ // RUN: -target-cpu cortex-a8 \ // RUN: -mfloat-abi soft \ // RUN: -target-feature +soft-float-abi \ // RUN: -ffreestanding \ -// RUN: -emit-llvm -w -O1 -o - %s | FileCheck %s +// RUN: -emit-llvm -w -o - %s | opt -S -mem2reg | FileCheck %s #include // Check that the vget_low/vget_high intrinsics generate a single shuffle // without any bitcasting. int8x8_t low_s8(int8x16_t a) { -// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> +// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> return vget_low_s8(a); } uint8x8_t low_u8 (uint8x16_t a) { -// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> +// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> return vget_low_u8(a); } int16x4_t low_s16( int16x8_t a) { -// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> +// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> return vget_low_s16(a); } uint16x4_t low_u16(uint16x8_t a) { -// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> +// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> return vget_low_u16(a); } int32x2_t low_s32( int32x4_t a) { -// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> +// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> return vget_low_s32(a); } uint32x2_t low_u32(uint32x4_t a) { -// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> +// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> return vget_low_u32(a); } int64x1_t low_s64( int64x2_t a) { -// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer +// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer return vget_low_s64(a); } uint64x1_t low_u64(uint64x2_t a) { -// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer +// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer return vget_low_u64(a); } poly8x8_t low_p8 (poly8x16_t a) { -// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> +// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> return vget_low_p8(a); } poly16x4_t low_p16(poly16x8_t a) { -// CHECK: 
shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> +// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> return vget_low_p16(a); } float32x2_t low_f32(float32x4_t a) { -// CHECK: shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> +// CHECK: shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> return vget_low_f32(a); } int8x8_t high_s8(int8x16_t a) { -// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> +// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> return vget_high_s8(a); } uint8x8_t high_u8 (uint8x16_t a) { -// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> +// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> return vget_high_u8(a); } int16x4_t high_s16( int16x8_t a) { -// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> +// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> return vget_high_s16(a); } uint16x4_t high_u16(uint16x8_t a) { -// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> +// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> return vget_high_u16(a); } int32x2_t high_s32( int32x4_t a) { -// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> +// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> return vget_high_s32(a); } uint32x2_t high_u32(uint32x4_t a) { -// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> +// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> return vget_high_u32(a); } int64x1_t high_s64( int64x2_t a) { -// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> +// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> return vget_high_s64(a); } uint64x1_t high_u64(uint64x2_t a) { -// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> +// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> return vget_high_u64(a); } poly8x8_t high_p8 (poly8x16_t a) { -// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> +// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> return vget_high_p8(a); } poly16x4_t high_p16(poly16x8_t a) { -// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> +// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> return vget_high_p16(a); } float32x2_t high_f32(float32x4_t a) { -// CHECK: shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> +// CHECK: shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> return vget_high_f32(a); } Index: test/CodeGen/arm64-be-bitfield.c =================================================================== --- test/CodeGen/arm64-be-bitfield.c +++ test/CodeGen/arm64-be-bitfield.c @@ -1,6 +1,4 @@ -// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64_be-linux-gnu -ffreestanding -emit-llvm -O0 -o - %s | FileCheck --check-prefix IR %s -// RUN: %clang_cc1 -triple aarch64_be-linux-gnu -ffreestanding -S -O1 -o - %s | FileCheck --check-prefix ARM %s struct bt3 { signed b2:10; signed b3:10; } b16; @@ -10,6 +8,5 @@ // IR: store i64 [[ARG]], i64* [[PTR:%.*]], align 8 // IR: [[BITCAST:%.*]] = bitcast i64* [[PTR]] to i8* // IR: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* [[BITCAST]], i64 4 -// ARM: asr x0, x0, #54 return bp11.b2; } Index: test/CodeGen/arm64-crc32.c =================================================================== --- test/CodeGen/arm64-crc32.c +++ test/CodeGen/arm64-crc32.c @@ -1,6 +1,6 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple arm64-none-linux-gnu \ -// RUN: -O3 -S -emit-llvm -o - %s | FileCheck %s +// RUN: -S 
-emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s int crc32b(int a, char b) { Index: test/CodeGen/arm64-lanes.c =================================================================== --- test/CodeGen/arm64-lanes.c +++ test/CodeGen/arm64-lanes.c @@ -1,74 +1,127 @@ -// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -O3 -triple aarch64_be-linux-gnu -target-feature +neon -ffreestanding -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-BE +// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple aarch64_be-linux-gnu -target-feature +neon -ffreestanding -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s --check-prefix CHECK-BE #include -// CHECK-LABEL: @test_vdupb_lane_s8 int8_t test_vdupb_lane_s8(int8x8_t src) { return vdupb_lane_s8(src, 2); + // CHECK-LABEL: @test_vdupb_lane_s8 // CHECK: extractelement <8 x i8> %src, i32 2 - // CHECK-BE: extractelement <8 x i8> %src, i32 5 + + // CHECK-BE-LABEL: @test_vdupb_lane_s8 + // CHECK-BE: [[REV:%.*]] = shufflevector <8 x i8> {{.*}}, <8 x i32> + // CHECK-BE: extractelement <8 x i8> [[REV]], i32 2 } -// CHECK-LABEL: @test_vdupb_lane_u8 uint8_t test_vdupb_lane_u8(uint8x8_t src) { return vdupb_lane_u8(src, 2); + // CHECK-LABEL: @test_vdupb_lane_u8 // CHECK: extractelement <8 x i8> %src, i32 2 - // CHECK-BE: extractelement <8 x i8> %src, i32 5 + + // CHECK-BE-LABEL: @test_vdupb_lane_u8 + // CHECK-BE: [[REV:%.*]] = shufflevector <8 x i8> {{.*}}, <8 x i32> + // CHECK-BE: extractelement <8 x i8> [[REV]], i32 2 } -// CHECK-LABEL: @test_vduph_lane_s16 int16_t test_vduph_lane_s16(int16x4_t src) { return vduph_lane_s16(src, 2); - // CHECK: extractelement <4 x i16> %src, i32 2 - // CHECK-BE: extractelement <4 x i16> %src, i32 1 + // CHECK-LABEL: @test_vduph_lane_s16 + // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16> + // CHECK: extractelement <4 x i16> [[TMP2]], i32 2 + + // CHECK-BE-LABEL: @test_vduph_lane_s16 + // CHECK-BE: [[REV:%.*]] = shufflevector <4 x i16> {{.*}}, <4 x i32> + // CHECK-BE: [[TMP1:%.*]] = bitcast <4 x i16> [[REV]] to [[TYPE:.*]] + // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16> + // CHECK-BE: extractelement <4 x i16> [[TMP2]], i32 2 } -// CHECK-LABEL: @test_vduph_lane_u16 uint16_t test_vduph_lane_u16(uint16x4_t src) { return vduph_lane_u16(src, 2); - // CHECK: extractelement <4 x i16> %src, i32 2 - // CHECK-BE: extractelement <4 x i16> %src, i32 1 + // CHECK-LABEL: @test_vduph_lane_u16 + // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16> + // CHECK: extractelement <4 x i16> [[TMP2]], i32 2 + + // CHECK-BE-LABEL: @test_vduph_lane_u16 + // CHECK-BE: [[REV:%.*]] = shufflevector <4 x i16> {{.*}}, <4 x i32> + // CHECK-BE: [[TMP1:%.*]] = bitcast <4 x i16> [[REV]] to [[TYPE:.*]] + // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16> + // CHECK-BE: extractelement <4 x i16> [[TMP2]], i32 2 } -// CHECK-LABEL: @test_vdups_lane_s32 int32_t test_vdups_lane_s32(int32x2_t src) { return vdups_lane_s32(src, 0); - // CHECK: extractelement <2 x i32> %src, i32 0 - // CHECK-BE: extractelement <2 x i32> %src, i32 1 + // CHECK-LABEL: @test_vdups_lane_s32 + // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32> + 
// CHECK: extractelement <2 x i32> [[TMP2]], i32 0 + + // CHECK-BE-LABEL: @test_vdups_lane_s32 + // CHECK-BE: [[REV:%.*]] = shufflevector <2 x i32> {{.*}}, <2 x i32> + // CHECK-BE: [[TMP1:%.*]] = bitcast <2 x i32> [[REV]] to [[TYPE:.*]] + // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32> + // CHECK-BE: extractelement <2 x i32> [[TMP2]], i32 0 } -// CHECK-LABEL: @test_vdups_lane_u32 uint32_t test_vdups_lane_u32(uint32x2_t src) { return vdups_lane_u32(src, 0); - // CHECK: extractelement <2 x i32> %src, i32 0 - // CHECK-BE: extractelement <2 x i32> %src, i32 1 + // CHECK-LABEL: @test_vdups_lane_u32 + // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32> + // CHECK: extractelement <2 x i32> [[TMP2]], i32 0 + + // CHECK-BE-LABEL: @test_vdups_lane_u32 + // CHECK-BE: [[REV:%.*]] = shufflevector <2 x i32> {{.*}}, <2 x i32> + // CHECK-BE: [[TMP1:%.*]] = bitcast <2 x i32> [[REV]] to [[TYPE:.*]] + // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32> + // CHECK-BE: extractelement <2 x i32> [[TMP2]], i32 0 } -// CHECK-LABEL: @test_vdups_lane_f32 float32_t test_vdups_lane_f32(float32x2_t src) { return vdups_lane_f32(src, 0); - // CHECK: extractelement <2 x float> %src, i32 0 - // CHECK-BE: extractelement <2 x float> %src, i32 1 + // CHECK-LABEL: @test_vdups_lane_f32 + // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x float> + // CHECK: extractelement <2 x float> [[TMP2]], i32 0 + + // CHECK-BE-LABEL: @test_vdups_lane_f32 + // CHECK-BE: [[REV:%.*]] = shufflevector <2 x float> {{.*}}, <2 x i32> + // CHECK-BE: [[TMP1:%.*]] = bitcast <2 x float> [[REV]] to [[TYPE:.*]] + // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x float> + // CHECK-BE: extractelement <2 x float> [[TMP2]], i32 0 } -// CHECK-LABEL: @test_vdupd_lane_s64 int64_t test_vdupd_lane_s64(int64x1_t src) { return vdupd_lane_s64(src, 0); - // CHECK: extractelement <1 x i64> %src, i32 0 - // CHECK-BE: extractelement <1 x i64> %src, i32 0 + // CHECK-LABEL: @test_vdupd_lane_s64 + // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <1 x i64> + // CHECK: extractelement <1 x i64> [[TMP2]], i32 0 + + // CHECK-BE-LABEL: @test_vdupd_lane_s64 + // CHECK-BE: extractelement <1 x i64> {{.*}}, i32 0 } -// CHECK-LABEL: @test_vdupd_lane_u64 uint64_t test_vdupd_lane_u64(uint64x1_t src) { return vdupd_lane_u64(src, 0); - // CHECK: extractelement <1 x i64> %src, i32 0 - // CHECK-BE: extractelement <1 x i64> %src, i32 0 + // CHECK-LABEL: @test_vdupd_lane_u64 + // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <1 x i64> + // CHECK: extractelement <1 x i64> [[TMP2]], i32 0 + + // CHECK-BE-LABEL: @test_vdupd_lane_u64 + // CHECK-BE: extractelement <1 x i64> {{.*}}, i32 0 } -// CHECK-LABEL: @test_vdupd_lane_f64 float64_t test_vdupd_lane_f64(float64x1_t src) { return vdupd_lane_f64(src, 0); - // CHECK: extractelement <1 x double> %src, i32 0 - // CHECK-BE: extractelement <1 x double> %src, i32 0 + // CHECK-LABEL: @test_vdupd_lane_f64 + // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %src to [[TYPE:.*]] + // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <1 x double> + // CHECK: extractelement <1 x double> [[TMP2]], i32 0 + + // CHECK-BE-LABEL: @test_vdupd_lane_f64 + // CHECK-BE: extractelement <1 x double> {{.*}}, i32 0 } Index: 
test/CodeGen/arm64_vcopy.c =================================================================== --- test/CodeGen/arm64_vcopy.c +++ test/CodeGen/arm64_vcopy.c @@ -1,69 +1,121 @@ -// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s +// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | opt -S -mem2reg | FileCheck %s // Test ARM64 SIMD copy vector element to vector element: vcopyq_lane* #include +// CHECK-LABEL: define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %a1, <16 x i8> %a2) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a2, i32 13 +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %a1, i8 [[VGETQ_LANE]], i32 3 +// CHECK: ret <16 x i8> [[VSET_LANE]] int8x16_t test_vcopyq_laneq_s8(int8x16_t a1, int8x16_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_s8 return vcopyq_laneq_s8(a1, (int64_t) 3, a2, (int64_t) 13); - // CHECK: shufflevector <16 x i8> %a1, <16 x i8> %a2, <16 x i32> } +// CHECK-LABEL: define <16 x i8> @test_vcopyq_laneq_u8(<16 x i8> %a1, <16 x i8> %a2) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a2, i32 13 +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %a1, i8 [[VGETQ_LANE]], i32 3 +// CHECK: ret <16 x i8> [[VSET_LANE]] uint8x16_t test_vcopyq_laneq_u8(uint8x16_t a1, uint8x16_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_u8 return vcopyq_laneq_u8(a1, (int64_t) 3, a2, (int64_t) 13); - // CHECK: shufflevector <16 x i8> %a1, <16 x i8> %a2, <16 x i32> } +// CHECK-LABEL: define <8 x i16> @test_vcopyq_laneq_s16(<8 x i16> %a1, <8 x i16> %a2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[VGETQ_LANE]], i32 3 +// CHECK: ret <8 x i16> [[VSET_LANE]] int16x8_t test_vcopyq_laneq_s16(int16x8_t a1, int16x8_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_s16 return vcopyq_laneq_s16(a1, (int64_t) 3, a2, (int64_t) 7); - // CHECK: shufflevector <8 x i16> %a1, <8 x i16> %a2, <8 x i32> } +// CHECK-LABEL: define <8 x i16> @test_vcopyq_laneq_u16(<8 x i16> %a1, <8 x i16> %a2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[VGETQ_LANE]], i32 3 +// CHECK: ret <8 x i16> [[VSET_LANE]] uint16x8_t test_vcopyq_laneq_u16(uint16x8_t a1, uint16x8_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_u16 return vcopyq_laneq_u16(a1, (int64_t) 3, a2, (int64_t) 7); - // CHECK: shufflevector <8 x i16> %a1, <8 x i16> %a2, <8 x i32> } +// CHECK-LABEL: define <4 x i32> @test_vcopyq_laneq_s32(<4 x i32> %a1, <4 x i32> %a2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VSET_LANE:%.*]] = 
insertelement <4 x i32> [[TMP3]], i32 [[VGETQ_LANE]], i32 3 +// CHECK: ret <4 x i32> [[VSET_LANE]] int32x4_t test_vcopyq_laneq_s32(int32x4_t a1, int32x4_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_s32 return vcopyq_laneq_s32(a1, (int64_t) 3, a2, (int64_t) 3); - // CHECK: shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> } +// CHECK-LABEL: define <4 x i32> @test_vcopyq_laneq_u32(<4 x i32> %a1, <4 x i32> %a2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[VGETQ_LANE]], i32 3 +// CHECK: ret <4 x i32> [[VSET_LANE]] uint32x4_t test_vcopyq_laneq_u32(uint32x4_t a1, uint32x4_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_u32 return vcopyq_laneq_u32(a1, (int64_t) 3, a2, (int64_t) 3); - // CHECK: shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> } +// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_s64(<2 x i64> %a1, <2 x i64> %a2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 0 +// CHECK: ret <2 x i64> [[VSET_LANE]] int64x2_t test_vcopyq_laneq_s64(int64x2_t a1, int64x2_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_s64 return vcopyq_laneq_s64(a1, (int64_t) 0, a2, (int64_t) 1); - // CHECK: shufflevector <2 x i64> %a1, <2 x i64> %a2, <2 x i32> } +// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_u64(<2 x i64> %a1, <2 x i64> %a2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 0 +// CHECK: ret <2 x i64> [[VSET_LANE]] uint64x2_t test_vcopyq_laneq_u64(uint64x2_t a1, uint64x2_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_u64 return vcopyq_laneq_u64(a1, (int64_t) 0, a2, (int64_t) 1); - // CHECK: shufflevector <2 x i64> %a1, <2 x i64> %a2, <2 x i32> } +// CHECK-LABEL: define <4 x float> @test_vcopyq_laneq_f32(<4 x float> %a1, <4 x float> %a2) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP3]], float [[VGETQ_LANE]], i32 0 +// CHECK: ret <4 x float> [[VSET_LANE]] float32x4_t test_vcopyq_laneq_f32(float32x4_t a1, float32x4_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_f32 return vcopyq_laneq_f32(a1, 0, a2, 3); - // CHECK: shufflevector <4 x float> %a1, <4 x float> %a2, <4 x i32> } +// CHECK-LABEL: define <2 x double> @test_vcopyq_laneq_f64(<2 x double> %a1, <2 x double> %a2) 
#0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a2 to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %a1 to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x double> [[TMP3]], double [[VGETQ_LANE]], i32 0 +// CHECK: ret <2 x double> [[VSET_LANE]] float64x2_t test_vcopyq_laneq_f64(float64x2_t a1, float64x2_t a2) { - // CHECK-LABEL: test_vcopyq_laneq_f64 return vcopyq_laneq_f64(a1, 0, a2, 1); - // CHECK: shufflevector <2 x double> %a1, <2 x double> %a2, <2 x i32> } Index: test/CodeGen/arm64_vcreate.c =================================================================== --- test/CodeGen/arm64_vcreate.c +++ test/CodeGen/arm64_vcreate.c @@ -1,7 +1,6 @@ -// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s +// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | opt -S -mem2reg | FileCheck %s // Test ARM64 SIMD vcreate intrinsics -/*#include */ #include float32x2_t test_vcreate_f32(uint64_t a1) { @@ -10,14 +9,3 @@ // CHECK: bitcast {{.*}} to <2 x float> // CHECK-NEXT: ret } - -// FIXME enable when scalar_to_vector in backend is fixed. Also, change -// CHECK@ to CHECK and CHECK-NEXT@ to CHECK-NEXT -/* -float64x1_t test_vcreate_f64(uint64_t a1) { - // CHECK@ test_vcreate_f64 - return vcreate_f64(a1); - // CHECK@ llvm.aarch64.neon.saddlv.i64.v2i32 - // CHECK-NEXT@ ret -} -*/ Index: test/CodeGen/arm64_vdupq_n_f64.c =================================================================== --- test/CodeGen/arm64_vdupq_n_f64.c +++ test/CodeGen/arm64_vdupq_n_f64.c @@ -1,88 +1,78 @@ -// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - %s | FileCheck %s -// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | \ -// RUN: FileCheck -check-prefix=CHECK-IR %s -// REQUIRES: aarch64-registered-target - -/// Test vdupq_n_f64 and vmovq_nf64 ARM64 intrinsics -// ARM64: vdupq_n_f64 and vdupq_lane_f64 intrinsics -// missing - +// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -fallow-half-arguments-and-returns -S -o - -emit-llvm %s | opt -S -mem2reg | FileCheck %s #include // vdupq_n_f64 -> dup.2d v0, v0[0] // -float64x2_t test_vdupq_n_f64(float64_t w) -{ +// CHECK-LABEL: define <2 x double> @test_vdupq_n_f64(double %w) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %w, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1 +// CHECK: ret <2 x double> [[VECINIT1_I]] +float64x2_t test_vdupq_n_f64(float64_t w) { return vdupq_n_f64(w); - // CHECK-LABEL: test_vdupq_n_f64: - // CHECK: dup.2d v0, v0[0] - // CHECK-NEXT: ret } // might as well test this while we're here // vdupq_n_f32 -> dup.4s v0, v0[0] -float32x4_t test_vdupq_n_f32(float32_t w) -{ +// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(float %w) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %w, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %w, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %w, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %w, i32 3 +// CHECK: ret <4 x float> 
[[VECINIT3_I]] +float32x4_t test_vdupq_n_f32(float32_t w) { return vdupq_n_f32(w); - // CHECK-LABEL: test_vdupq_n_f32: - // CHECK: dup.4s v0, v0[0] - // CHECK-NEXT: ret } // vdupq_lane_f64 -> dup.2d v0, v0[0] // this was in , but had already been implemented, // test anyway -float64x2_t test_vdupq_lane_f64(float64x1_t V) -{ +// CHECK-LABEL: define <2 x double> @test_vdupq_lane_f64(<1 x double> %V) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %V, <1 x double> %V, <2 x i32> zeroinitializer +// CHECK: ret <2 x double> [[SHUFFLE]] +float64x2_t test_vdupq_lane_f64(float64x1_t V) { return vdupq_lane_f64(V, 0); - // CHECK-LABEL: test_vdupq_lane_f64: - // CHECK: dup.2d v0, v0[0] - // CHECK-NEXT: ret } // vmovq_n_f64 -> dup Vd.2d,X0 // this wasn't in , but it was between the vdups -float64x2_t test_vmovq_n_f64(float64_t w) -{ +// CHECK-LABEL: define <2 x double> @test_vmovq_n_f64(double %w) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %w, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1 +// CHECK: ret <2 x double> [[VECINIT1_I]] +float64x2_t test_vmovq_n_f64(float64_t w) { return vmovq_n_f64(w); - // CHECK-LABEL: test_vmovq_n_f64: - // CHECK: dup.2d v0, v0[0] - // CHECK-NEXT: ret } -float16x4_t test_vmov_n_f16(float16_t *a1) -{ - // CHECK-IR-LABEL: test_vmov_n_f16 +// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(half* %a1) #0 { +// CHECK: [[TMP0:%.*]] = load half, half* %a1, align 2 +// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0 +// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1 +// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2 +// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3 +// CHECK: ret <4 x half> [[VECINIT3]] +float16x4_t test_vmov_n_f16(float16_t *a1) { return vmov_n_f16(*a1); - // CHECK-IR: insertelement {{.*}} i32 0{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 1{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 2{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 3{{ *$}} } -// Disable until scalar problem in backend is fixed.
Change CHECK-IR@ to -// CHECK-IR /* -float64x1_t test_vmov_n_f64(float64_t a1) -{ - // CHECK-IR@ test_vmov_n_f64 +float64x1_t test_vmov_n_f64(float64_t a1) { return vmov_n_f64(a1); - // CHECK-IR@ insertelement {{.*}} i32 0{{ *$}} } */ -float16x8_t test_vmovq_n_f16(float16_t *a1) -{ - // CHECK-IR-LABEL: test_vmovq_n_f16 +// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(half* %a1) #0 { +// CHECK: [[TMP0:%.*]] = load half, half* %a1, align 2 +// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0 +// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1 +// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2 +// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3 +// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4 +// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5 +// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6 +// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7 +// CHECK: ret <8 x half> [[VECINIT7]] +float16x8_t test_vmovq_n_f16(float16_t *a1) { return vmovq_n_f16(*a1); - // CHECK-IR: insertelement {{.*}} i32 0{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 1{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 2{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 3{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 4{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 5{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 6{{ *$}} - // CHECK-IR: insertelement {{.*}} i32 7{{ *$}} } Index: test/CodeGen/arm_neon_intrinsics.c =================================================================== --- test/CodeGen/arm_neon_intrinsics.c +++ test/CodeGen/arm_neon_intrinsics.c @@ -1,1611 +1,2446 @@ // RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\ -// RUN: -target-cpu swift -ffreestanding -Os -S -o - %s\ -// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT -// RUN: %clang_cc1 -triple armv8-linux-gnu \ -// RUN: -target-cpu cortex-a57 -mfloat-abi soft -ffreestanding -Os -S -o - %s\ -// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-A57 +// RUN: -target-cpu swift -ffreestanding -emit-llvm -S -o - %s\ +// RUN: | opt -S -mem2reg | FileCheck %s // REQUIRES: long_tests #include -// CHECK-LABEL: test_vaba_s8 -// CHECK: vaba.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vaba_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_I_I]] +// CHECK: ret <8 x i8> [[ADD_I]] int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vaba_s8(a, b, c); } -// CHECK-LABEL: test_vaba_s16 -// CHECK: vaba.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vaba_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD2_I_I]] +// CHECK: ret <4 x i16> [[ADD_I]] int16x4_t
test_vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vaba_s16(a, b, c); } -// CHECK-LABEL: test_vaba_s32 -// CHECK: vaba.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vaba_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD2_I_I]] +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vaba_s32(a, b, c); } -// CHECK-LABEL: test_vaba_u8 -// CHECK: vaba.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vaba_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_I_I]] +// CHECK: ret <8 x i8> [[ADD_I]] uint8x8_t test_vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vaba_u8(a, b, c); } -// CHECK-LABEL: test_vaba_u16 -// CHECK: vaba.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vaba_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD2_I_I]] +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vaba_u16(a, b, c); } -// CHECK-LABEL: test_vaba_u32 -// CHECK: vaba.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vaba_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD2_I_I]] +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vaba_u32(a, b, c); } -// CHECK-LABEL: test_vabaq_s8 -// CHECK: vaba.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vabaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %b, <16 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[VABD_I_I]] +// CHECK: ret <16 x i8> [[ADD_I]] int8x16_t test_vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return vabaq_s8(a, b, c); } -// CHECK-LABEL: test_vabaq_s16 -// CHECK: vaba.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +//
CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABD2_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return vabaq_s16(a, b, c); } -// CHECK-LABEL: test_vabaq_s32 -// CHECK: vaba.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABD2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return vabaq_s32(a, b, c); } -// CHECK-LABEL: test_vabaq_u8 -// CHECK: vaba.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vabaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %b, <16 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[VABD_I_I]] +// CHECK: ret <16 x i8> [[ADD_I]] uint8x16_t test_vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vabaq_u8(a, b, c); } -// CHECK-LABEL: test_vabaq_u16 -// CHECK: vaba.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABD2_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vabaq_u16(a, b, c); } -// CHECK-LABEL: test_vabaq_u32 -// CHECK: vaba.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABD2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vabaq_u32(a, b, c); } -// CHECK-LABEL: test_vabal_s8 -// CHECK: vabal.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vabal_s8(a, b, c); } -// CHECK-LABEL: test_vabal_s16 -// CHECK: vabal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vabal_s16(a, b, c); } -// CHECK-LABEL: test_vabal_s32 -// CHECK: vabal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vabal_s32(a, b, c); } -// CHECK-LABEL: test_vabal_u8 -// CHECK: vabal.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vabal_u8(a, b, c); } -// CHECK-LABEL: test_vabal_u16 -// CHECK: vabal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 
+// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vabal_u16(a, b, c); } -// CHECK-LABEL: test_vabal_u32 -// CHECK: vabal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vabal_u32(a, b, c); } -// CHECK-LABEL: test_vabd_s8 -// CHECK: vabd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vabd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VABD_I]] int8x8_t test_vabd_s8(int8x8_t a, int8x8_t b) { return vabd_s8(a, b); } -// CHECK-LABEL: test_vabd_s16 -// CHECK: vabd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vabd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4 +// CHECK: ret <4 x i16> [[VABD2_I]] int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) { return vabd_s16(a, b); } -// CHECK-LABEL: test_vabd_s32 -// CHECK: vabd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vabd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4 +// CHECK: ret <2 x i32> [[VABD2_I]] int32x2_t test_vabd_s32(int32x2_t a, int32x2_t b) { return vabd_s32(a, b); } -// CHECK-LABEL: test_vabd_u8 -// CHECK: vabd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vabd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VABD_I]] uint8x8_t test_vabd_u8(uint8x8_t a, uint8x8_t b) { return vabd_u8(a, b); } -// CHECK-LABEL: test_vabd_u16 -// CHECK: vabd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vabd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// 
CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4 +// CHECK: ret <4 x i16> [[VABD2_I]] uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) { return vabd_u16(a, b); } -// CHECK-LABEL: test_vabd_u32 -// CHECK: vabd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vabd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4 +// CHECK: ret <2 x i32> [[VABD2_I]] uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) { return vabd_u32(a, b); } -// CHECK-LABEL: test_vabd_f32 -// CHECK: vabd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vabd_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> [[VABD_I]], <2 x float> [[VABD1_I]]) #4 +// CHECK: ret <2 x float> [[VABD2_I]] float32x2_t test_vabd_f32(float32x2_t a, float32x2_t b) { return vabd_f32(a, b); } -// CHECK-LABEL: test_vabdq_s8 -// CHECK: vabd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vabdq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VABD_I]] int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) { return vabdq_s8(a, b); } -// CHECK-LABEL: test_vabdq_s16 -// CHECK: vabd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabdq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4 +// CHECK: ret <8 x i16> [[VABD2_I]] int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) { return vabdq_s16(a, b); } -// CHECK-LABEL: test_vabdq_s32 -// CHECK: vabd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabdq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4 +// CHECK: ret <4 x i32> [[VABD2_I]] int32x4_t test_vabdq_s32(int32x4_t a, int32x4_t b) { return vabdq_s32(a, b); } -// CHECK-LABEL: test_vabdq_u8 -// CHECK: vabd.u8 
q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vabdq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VABD_I]] uint8x16_t test_vabdq_u8(uint8x16_t a, uint8x16_t b) { return vabdq_u8(a, b); } -// CHECK-LABEL: test_vabdq_u16 -// CHECK: vabd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabdq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4 +// CHECK: ret <8 x i16> [[VABD2_I]] uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) { return vabdq_u16(a, b); } -// CHECK-LABEL: test_vabdq_u32 -// CHECK: vabd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabdq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4 +// CHECK: ret <4 x i32> [[VABD2_I]] uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) { return vabdq_u32(a, b); } -// CHECK-LABEL: test_vabdq_f32 -// CHECK: vabd.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vabdq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> [[VABD_I]], <4 x float> [[VABD1_I]]) #4 +// CHECK: ret <4 x float> [[VABD2_I]] float32x4_t test_vabdq_f32(float32x4_t a, float32x4_t b) { return vabdq_f32(a, b); } -// CHECK-LABEL: test_vabdl_s8 -// CHECK: vabdl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I_I]] int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) { return vabdl_s8(a, b); } -// CHECK-LABEL: test_vabdl_s16 -// CHECK: vabdl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: 
[[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I_I]] int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) { return vabdl_s16(a, b); } -// CHECK-LABEL: test_vabdl_s32 -// CHECK: vabdl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I_I]] int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) { return vabdl_s32(a, b); } -// CHECK-LABEL: test_vabdl_u8 -// CHECK: vabdl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I_I]] uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) { return vabdl_u8(a, b); } -// CHECK-LABEL: test_vabdl_u16 -// CHECK: vabdl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I_I]] uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) { return vabdl_u16(a, b); } -// CHECK-LABEL: test_vabdl_u32 -// CHECK: vabdl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I_I]] uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) { return vabdl_u32(a, b); } -// CHECK-LABEL: test_vabs_s8 -// CHECK: vabs.s8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 { +// CHECK: [[VABS_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %a) #4 +// CHECK: ret <8 x i8> [[VABS_I]] int8x8_t 
test_vabs_s8(int8x8_t a) { return vabs_s8(a); } -// CHECK-LABEL: test_vabs_s16 -// CHECK: vabs.s16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> [[VABS_I]]) #4 +// CHECK: ret <4 x i16> [[VABS1_I]] int16x4_t test_vabs_s16(int16x4_t a) { return vabs_s16(a); } -// CHECK-LABEL: test_vabs_s32 -// CHECK: vabs.s32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> [[VABS_I]]) #4 +// CHECK: ret <2 x i32> [[VABS1_I]] int32x2_t test_vabs_s32(int32x2_t a) { return vabs_s32(a); } -// CHECK-LABEL: test_vabs_f32 -// CHECK: vabs.f32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vabs_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[VABS_I]]) #4 +// CHECK: ret <2 x float> [[VABS1_I]] float32x2_t test_vabs_f32(float32x2_t a) { return vabs_f32(a); } -// CHECK-LABEL: test_vabsq_s8 -// CHECK: vabs.s8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 { +// CHECK: [[VABS_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %a) #4 +// CHECK: ret <16 x i8> [[VABS_I]] int8x16_t test_vabsq_s8(int8x16_t a) { return vabsq_s8(a); } -// CHECK-LABEL: test_vabsq_s16 -// CHECK: vabs.s16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> [[VABS_I]]) #4 +// CHECK: ret <8 x i16> [[VABS1_I]] int16x8_t test_vabsq_s16(int16x8_t a) { return vabsq_s16(a); } -// CHECK-LABEL: test_vabsq_s32 -// CHECK: vabs.s32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> [[VABS_I]]) #4 +// CHECK: ret <4 x i32> [[VABS1_I]] int32x4_t test_vabsq_s32(int32x4_t a) { return vabsq_s32(a); } -// CHECK-LABEL: test_vabsq_f32 -// CHECK: vabs.f32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vabsq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VABS_I]]) #4 +// CHECK: ret <4 x float> [[VABS1_I]] float32x4_t test_vabsq_f32(float32x4_t a) { return vabsq_f32(a); } -// CHECK-LABEL: test_vadd_s8 -// CHECK: vadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[ADD_I]] int8x8_t test_vadd_s8(int8x8_t a, int8x8_t b) { return vadd_s8(a, b); } -// CHECK-LABEL: test_vadd_s16 
-// CHECK: vadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[ADD_I]] int16x4_t test_vadd_s16(int16x4_t a, int16x4_t b) { return vadd_s16(a, b); } -// CHECK-LABEL: test_vadd_s32 -// CHECK: vadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vadd_s32(int32x2_t a, int32x2_t b) { return vadd_s32(a, b); } -// CHECK-LABEL: test_vadd_s64 -// CHECK: vadd.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vadd_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[ADD_I]] int64x1_t test_vadd_s64(int64x1_t a, int64x1_t b) { return vadd_s64(a, b); } -// CHECK-LABEL: test_vadd_f32 -// CHECK: vadd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vadd_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, %b +// CHECK: ret <2 x float> [[ADD_I]] float32x2_t test_vadd_f32(float32x2_t a, float32x2_t b) { return vadd_f32(a, b); } -// CHECK-LABEL: test_vadd_u8 -// CHECK: vadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[ADD_I]] uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) { return vadd_u8(a, b); } -// CHECK-LABEL: test_vadd_u16 -// CHECK: vadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vadd_u16(uint16x4_t a, uint16x4_t b) { return vadd_u16(a, b); } -// CHECK-LABEL: test_vadd_u32 -// CHECK: vadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vadd_u32(uint32x2_t a, uint32x2_t b) { return vadd_u32(a, b); } -// CHECK-LABEL: test_vadd_u64 -// CHECK: vadd.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vadd_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[ADD_I]] uint64x1_t test_vadd_u64(uint64x1_t a, uint64x1_t b) { return vadd_u64(a, b); } -// CHECK-LABEL: test_vaddq_s8 -// CHECK: vadd.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[ADD_I]] int8x16_t test_vaddq_s8(int8x16_t a, int8x16_t b) { return vaddq_s8(a, b); } -// CHECK-LABEL: test_vaddq_s16 -// CHECK: vadd.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddq_s16(int16x8_t a, int16x8_t b) { return vaddq_s16(a, b); } -// CHECK-LABEL: test_vaddq_s32 -// CHECK: vadd.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[ADD_I]] 
int32x4_t test_vaddq_s32(int32x4_t a, int32x4_t b) { return vaddq_s32(a, b); } -// CHECK-LABEL: test_vaddq_s64 -// CHECK: vadd.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddq_s64(int64x2_t a, int64x2_t b) { return vaddq_s64(a, b); } -// CHECK-LABEL: test_vaddq_f32 -// CHECK: vadd.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vaddq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, %b +// CHECK: ret <4 x float> [[ADD_I]] float32x4_t test_vaddq_f32(float32x4_t a, float32x4_t b) { return vaddq_f32(a, b); } -// CHECK-LABEL: test_vaddq_u8 -// CHECK: vadd.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[ADD_I]] uint8x16_t test_vaddq_u8(uint8x16_t a, uint8x16_t b) { return vaddq_u8(a, b); } -// CHECK-LABEL: test_vaddq_u16 -// CHECK: vadd.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vaddq_u16(uint16x8_t a, uint16x8_t b) { return vaddq_u16(a, b); } -// CHECK-LABEL: test_vaddq_u32 -// CHECK: vadd.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b) { return vaddq_u32(a, b); } -// CHECK-LABEL: test_vaddq_u64 -// CHECK: vadd.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddq_u64(uint64x2_t a, uint64x2_t b) { return vaddq_u64(a, b); } -// CHECK-LABEL: test_vaddhn_s16 -// CHECK: vaddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], +// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[VADDHN2_I]] int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) { return vaddhn_s16(a, b); } -// CHECK-LABEL: test_vaddhn_s32 -// CHECK: vaddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], +// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[VADDHN2_I]] int16x4_t 
test_vaddhn_s32(int32x4_t a, int32x4_t b) { return vaddhn_s32(a, b); } -// CHECK-LABEL: test_vaddhn_s64 -// CHECK: vaddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], +// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[VADDHN2_I]] int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) { return vaddhn_s64(a, b); } -// CHECK-LABEL: test_vaddhn_u16 -// CHECK: vaddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], +// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[VADDHN2_I]] uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) { return vaddhn_u16(a, b); } -// CHECK-LABEL: test_vaddhn_u32 -// CHECK: vaddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], +// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[VADDHN2_I]] uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) { return vaddhn_u32(a, b); } -// CHECK-LABEL: test_vaddhn_u64 -// CHECK: vaddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], +// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[VADDHN2_I]] uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) { return vaddhn_u64(a, b); } -// CHECK-LABEL: test_vaddl_s8 -// CHECK: vaddl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) { return vaddl_s8(a, b); } -// CHECK-LABEL: 
test_vaddl_s16 -// CHECK: vaddl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) { return vaddl_s16(a, b); } -// CHECK-LABEL: test_vaddl_s32 -// CHECK: vaddl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) { return vaddl_s32(a, b); } -// CHECK-LABEL: test_vaddl_u8 -// CHECK: vaddl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) { return vaddl_u8(a, b); } -// CHECK-LABEL: test_vaddl_u16 -// CHECK: vaddl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) { return vaddl_u16(a, b); } -// CHECK-LABEL: test_vaddl_u32 -// CHECK: vaddl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) { return vaddl_u32(a, b); } -// CHECK-LABEL: test_vaddw_s8 -// CHECK: vaddw.s8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define 
<8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) { return vaddw_s8(a, b); } -// CHECK-LABEL: test_vaddw_s16 -// CHECK: vaddw.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) { return vaddw_s16(a, b); } -// CHECK-LABEL: test_vaddw_s32 -// CHECK: vaddw.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) { return vaddw_s32(a, b); } -// CHECK-LABEL: test_vaddw_u8 -// CHECK: vaddw.u8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16> +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) { return vaddw_u8(a, b); } -// CHECK-LABEL: test_vaddw_u16 -// CHECK: vaddw.u16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) { return vaddw_u16(a, b); } -// CHECK-LABEL: test_vaddw_u32 -// CHECK: vaddw.u32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) { return vaddw_u32(a, b); } -// CHECK-LABEL: test_vand_s8 -// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vand_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[AND_I]] int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) { return vand_s8(a, b); } -// CHECK-LABEL: test_vand_s16 -// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vand_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[AND_I]] int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) { return vand_s16(a, b); } -// CHECK-LABEL: test_vand_s32 -// CHECK: vand 
d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vand_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[AND_I]] int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) { return vand_s32(a, b); } -// CHECK-LABEL: test_vand_s64 -// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vand_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[AND_I]] int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) { return vand_s64(a, b); } -// CHECK-LABEL: test_vand_u8 -// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vand_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[AND_I]] uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) { return vand_u8(a, b); } -// CHECK-LABEL: test_vand_u16 -// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vand_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[AND_I]] uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) { return vand_u16(a, b); } -// CHECK-LABEL: test_vand_u32 -// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vand_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[AND_I]] uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) { return vand_u32(a, b); } -// CHECK-LABEL: test_vand_u64 -// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vand_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[AND_I]] uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) { return vand_u64(a, b); } -// CHECK-LABEL: test_vandq_s8 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vandq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[AND_I]] int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) { return vandq_s8(a, b); } -// CHECK-LABEL: test_vandq_s16 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[AND_I]] int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) { return vandq_s16(a, b); } -// CHECK-LABEL: test_vandq_s32 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vandq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[AND_I]] int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) { return vandq_s32(a, b); } -// CHECK-LABEL: test_vandq_s64 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vandq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[AND_I]] int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) { return vandq_s64(a, b); } -// CHECK-LABEL: test_vandq_u8 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vandq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[AND_I]] uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) { return 
vandq_u8(a, b); } -// CHECK-LABEL: test_vandq_u16 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vandq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[AND_I]] uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) { return vandq_u16(a, b); } -// CHECK-LABEL: test_vandq_u32 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[AND_I]] uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) { return vandq_u32(a, b); } -// CHECK-LABEL: test_vandq_u64 -// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vandq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[AND_I]] uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) { return vandq_u64(a, b); } -// CHECK-LABEL: test_vbic_s8 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vbic_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[AND_I]] int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) { return vbic_s8(a, b); } -// CHECK-LABEL: test_vbic_s16 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vbic_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[AND_I]] int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) { return vbic_s16(a, b); } -// CHECK-LABEL: test_vbic_s32 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vbic_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]] +// CHECK: ret <2 x i32> [[AND_I]] int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) { return vbic_s32(a, b); } -// CHECK-LABEL: test_vbic_s64 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vbic_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[AND_I]] int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) { return vbic_s64(a, b); } -// CHECK-LABEL: test_vbic_u8 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vbic_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[AND_I]] uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) { return vbic_u8(a, b); } -// CHECK-LABEL: test_vbic_u16 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vbic_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[AND_I]] uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) { return vbic_u16(a, b); } -// CHECK-LABEL: test_vbic_u32 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vbic_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]] +// CHECK: 
ret <2 x i32> [[AND_I]] uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) { return vbic_u32(a, b); } -// CHECK-LABEL: test_vbic_u64 -// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vbic_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[AND_I]] uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) { return vbic_u64(a, b); } -// CHECK-LABEL: test_vbicq_s8 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[AND_I]] int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) { return vbicq_s8(a, b); } -// CHECK-LABEL: test_vbicq_s16 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[AND_I]] int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) { return vbicq_s16(a, b); } -// CHECK-LABEL: test_vbicq_s32 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[AND_I]] int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) { return vbicq_s32(a, b); } -// CHECK-LABEL: test_vbicq_s64 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]] +// CHECK: ret <2 x i64> [[AND_I]] int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) { return vbicq_s64(a, b); } -// CHECK-LABEL: test_vbicq_u8 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vbicq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, +// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[AND_I]] uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) { return vbicq_u8(a, b); } -// CHECK-LABEL: test_vbicq_u16 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, +// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[AND_I]] uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) { return vbicq_u16(a, b); } -// CHECK-LABEL: test_vbicq_u32 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, +// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[AND_I]] uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) { return vbicq_u32(a, b); } -// CHECK-LABEL: test_vbicq_u64 -// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, +// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]] +// CHECK: ret <2 x i64> [[AND_I]] uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) { return vbicq_u64(a, b); } 
-// CHECK-LABEL: test_vbsl_s8 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vbsl_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %a, %b +// CHECK: [[TMP0:%.*]] = xor <8 x i8> %a, +// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %c +// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <8 x i8> [[VBSL2_I]] int8x8_t test_vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) { return vbsl_s8(a, b, c); } -// CHECK-LABEL: test_vbsl_s16 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vbsl_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i16> [[VBSL5_I]] int16x4_t test_vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c) { return vbsl_s16(a, b, c); } -// CHECK-LABEL: test_vbsl_s32 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vbsl_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i32> [[VBSL5_I]] int32x2_t test_vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c) { return vbsl_s32(a, b, c); } -// CHECK-LABEL: test_vbsl_s64 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <1 x i64> [[VBSL5_I]] int64x1_t test_vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) { return vbsl_s64(a, b, c); } -// CHECK-LABEL: test_vbsl_u8 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vbsl_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VBSL_I:%.*]] = and <8 x i8> 
%a, %b +// CHECK: [[TMP0:%.*]] = xor <8 x i8> %a, +// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %c +// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <8 x i8> [[VBSL2_I]] uint8x8_t test_vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vbsl_u8(a, b, c); } -// CHECK-LABEL: test_vbsl_u16 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vbsl_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i16> [[VBSL5_I]] uint16x4_t test_vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vbsl_u16(a, b, c); } -// CHECK-LABEL: test_vbsl_u32 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vbsl_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i32> [[VBSL5_I]] uint32x2_t test_vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vbsl_u32(a, b, c); } -// CHECK-LABEL: test_vbsl_u64 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <1 x i64> [[VBSL5_I]] uint64x1_t test_vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c) { return vbsl_u64(a, b, c); } -// CHECK-LABEL: test_vbsl_f32 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vbsl_f32(<2 x i32> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> 
[[TMP0]] to <2 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP4]] float32x2_t test_vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) { return vbsl_f32(a, b, c); } -// CHECK-LABEL: test_vbsl_p8 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vbsl_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %a, %b +// CHECK: [[TMP0:%.*]] = xor <8 x i8> %a, +// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %c +// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <8 x i8> [[VBSL2_I]] poly8x8_t test_vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c) { return vbsl_p8(a, b, c); } -// CHECK-LABEL: test_vbsl_p16 -// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vbsl_p16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i16> [[VBSL5_I]] poly16x4_t test_vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c) { return vbsl_p16(a, b, c); } -// CHECK-LABEL: test_vbslq_s8 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vbslq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %a, %b +// CHECK: [[TMP0:%.*]] = xor <16 x i8> %a, +// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %c +// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <16 x i8> [[VBSL2_I]] int8x16_t test_vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) { return vbslq_s8(a, b, c); } -// CHECK-LABEL: test_vbslq_s16 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vbslq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <8 x i16> [[VBSL5_I]] int16x8_t test_vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) { return vbslq_s16(a, b, c); } 
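For reference, every vbsl hunk above checks the same bitwise-select expansion: each result bit is taken from the second operand where the mask bit is set and from the third operand where it is clear. A minimal scalar sketch of that identity (illustrative only; the function name is invented here and is not part of the test or the patch):

#include <stdint.h>

static inline uint8_t bsl_bit_sketch(uint8_t mask, uint8_t b, uint8_t c) {
  /* Same pattern as the checked IR: (mask & b) | (~mask & c). */
  return (uint8_t)((mask & b) | (~mask & c));
}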
-// CHECK-LABEL: test_vbslq_s32 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vbslq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i32> [[VBSL5_I]] int32x4_t test_vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) { return vbslq_s32(a, b, c); } -// CHECK-LABEL: test_vbslq_s64 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i64> [[VBSL5_I]] int64x2_t test_vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) { return vbslq_s64(a, b, c); } -// CHECK-LABEL: test_vbslq_u8 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vbslq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %a, %b +// CHECK: [[TMP0:%.*]] = xor <16 x i8> %a, +// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %c +// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <16 x i8> [[VBSL2_I]] uint8x16_t test_vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vbslq_u8(a, b, c); } -// CHECK-LABEL: test_vbslq_u16 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vbslq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <8 x i16> [[VBSL5_I]] uint16x8_t test_vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vbslq_u16(a, b, c); } -// CHECK-LABEL: test_vbslq_u32 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vbslq_u32(<4 x i32> %a, <4 x i32> 
%b, <4 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i32> [[VBSL5_I]] uint32x4_t test_vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vbslq_u32(a, b, c); } -// CHECK-LABEL: test_vbslq_u64 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <2 x i64> [[VBSL5_I]] uint64x2_t test_vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) { return vbslq_u64(a, b, c); } -// CHECK-LABEL: test_vbslq_f32 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vbslq_f32(<4 x i32> %a, <4 x float> %b, <4 x float> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP4]] float32x4_t test_vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) { return vbslq_f32(a, b, c); } -// CHECK-LABEL: test_vbslq_p8 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vbslq_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %a, %b +// CHECK: [[TMP0:%.*]] = xor <16 x i8> %a, +// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %c +// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK: ret <16 x i8> [[VBSL2_I]] poly8x16_t test_vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c) { return vbslq_p8(a, b, c); } -// CHECK-LABEL: test_vbslq_p16 -// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vbslq_p16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to 
<16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <8 x i16> [[VBSL5_I]] poly16x8_t test_vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) { return vbslq_p16(a, b, c); } -// CHECK-LABEL: test_vcage_f32 -// CHECK: vacge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcage_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCAGE_V_I]], <2 x float> [[VCAGE_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCAGE_V2_I]] uint32x2_t test_vcage_f32(float32x2_t a, float32x2_t b) { return vcage_f32(a, b); } -// CHECK-LABEL: test_vcageq_f32 -// CHECK: vacge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcageq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCAGEQ_V_I]], <4 x float> [[VCAGEQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCAGEQ_V2_I]] uint32x4_t test_vcageq_f32(float32x4_t a, float32x4_t b) { return vcageq_f32(a, b); } -// CHECK-LABEL: test_vcagt_f32 -// CHECK: vacgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcagt_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCAGT_V_I]], <2 x float> [[VCAGT_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCAGT_V2_I]] uint32x2_t test_vcagt_f32(float32x2_t a, float32x2_t b) { return vcagt_f32(a, b); } -// CHECK-LABEL: test_vcagtq_f32 -// CHECK: vacgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCAGTQ_V_I]], <4 x float> [[VCAGTQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCAGTQ_V2_I]] uint32x4_t test_vcagtq_f32(float32x4_t a, 
float32x4_t b) { return vcagtq_f32(a, b); } -// CHECK-LABEL: test_vcale_f32 -// CHECK: vacge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcale_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCALE_V_I]], <2 x float> [[VCALE_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCALE_V2_I]] uint32x2_t test_vcale_f32(float32x2_t a, float32x2_t b) { return vcale_f32(a, b); } -// CHECK-LABEL: test_vcaleq_f32 -// CHECK: vacge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCALEQ_V_I]], <4 x float> [[VCALEQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCALEQ_V2_I]] uint32x4_t test_vcaleq_f32(float32x4_t a, float32x4_t b) { return vcaleq_f32(a, b); } -// CHECK-LABEL: test_vcalt_f32 -// CHECK: vacgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcalt_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCALT_V_I]], <2 x float> [[VCALT_V1_I]]) #4 +// CHECK: ret <2 x i32> [[VCALT_V2_I]] uint32x2_t test_vcalt_f32(float32x2_t a, float32x2_t b) { return vcalt_f32(a, b); } -// CHECK-LABEL: test_vcaltq_f32 -// CHECK: vacgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCALTQ_V_I]], <4 x float> [[VCALTQ_V1_I]]) #4 +// CHECK: ret <4 x i32> [[VCALTQ_V2_I]] uint32x4_t test_vcaltq_f32(float32x4_t a, float32x4_t b) { return vcaltq_f32(a, b); } -// CHECK-LABEL: test_vceq_s8 -// CHECK: vceq.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vceq_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vceq_s8(int8x8_t a, int8x8_t b) { return vceq_s8(a, b); } -// CHECK-LABEL: test_vceq_s16 -// CHECK: vceq.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vceq_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = 
sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vceq_s16(int16x4_t a, int16x4_t b) { return vceq_s16(a, b); } -// CHECK-LABEL: test_vceq_s32 -// CHECK: vceq.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vceq_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vceq_s32(int32x2_t a, int32x2_t b) { return vceq_s32(a, b); } -// CHECK-LABEL: test_vceq_f32 -// CHECK: vceq.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vceq_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vceq_f32(float32x2_t a, float32x2_t b) { return vceq_f32(a, b); } -// CHECK-LABEL: test_vceq_u8 -// CHECK: vceq.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vceq_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vceq_u8(uint8x8_t a, uint8x8_t b) { return vceq_u8(a, b); } -// CHECK-LABEL: test_vceq_u16 -// CHECK: vceq.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vceq_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vceq_u16(uint16x4_t a, uint16x4_t b) { return vceq_u16(a, b); } -// CHECK-LABEL: test_vceq_u32 -// CHECK: vceq.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vceq_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vceq_u32(uint32x2_t a, uint32x2_t b) { return vceq_u32(a, b); } -// CHECK-LABEL: test_vceq_p8 -// CHECK: vceq.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vceq_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vceq_p8(poly8x8_t a, poly8x8_t b) { return vceq_p8(a, b); } -// CHECK-LABEL: test_vceqq_s8 -// CHECK: vceq.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vceqq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vceqq_s8(int8x16_t a, int8x16_t b) { return vceqq_s8(a, b); } -// CHECK-LABEL: test_vceqq_s16 -// CHECK: vceq.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vceqq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vceqq_s16(int16x8_t a, int16x8_t b) { return vceqq_s16(a, b); } -// CHECK-LABEL: test_vceqq_s32 -// CHECK: vceq.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vceqq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: 
[[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vceqq_s32(int32x4_t a, int32x4_t b) { return vceqq_s32(a, b); } -// CHECK-LABEL: test_vceqq_f32 -// CHECK: vceq.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vceqq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oeq <4 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vceqq_f32(float32x4_t a, float32x4_t b) { return vceqq_f32(a, b); } -// CHECK-LABEL: test_vceqq_u8 -// CHECK: vceq.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vceqq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vceqq_u8(uint8x16_t a, uint8x16_t b) { return vceqq_u8(a, b); } -// CHECK-LABEL: test_vceqq_u16 -// CHECK: vceq.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vceqq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vceqq_u16(uint16x8_t a, uint16x8_t b) { return vceqq_u16(a, b); } -// CHECK-LABEL: test_vceqq_u32 -// CHECK: vceq.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vceqq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vceqq_u32(uint32x4_t a, uint32x4_t b) { return vceqq_u32(a, b); } -// CHECK-LABEL: test_vceqq_p8 -// CHECK: vceq.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vceqq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vceqq_p8(poly8x16_t a, poly8x16_t b) { return vceqq_p8(a, b); } -// CHECK-LABEL: test_vcge_s8 -// CHECK: vcge.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcge_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcge_s8(int8x8_t a, int8x8_t b) { return vcge_s8(a, b); } -// CHECK-LABEL: test_vcge_s16 -// CHECK: vcge.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vcge_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcge_s16(int16x4_t a, int16x4_t b) { return vcge_s16(a, b); } -// CHECK-LABEL: test_vcge_s32 -// CHECK: vcge.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcge_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcge_s32(int32x2_t a, int32x2_t b) { return vcge_s32(a, b); } -// CHECK-LABEL: test_vcge_f32 -// CHECK: vcge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// 
CHECK-LABEL: define <2 x i32> @test_vcge_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcge_f32(float32x2_t a, float32x2_t b) { return vcge_f32(a, b); } -// CHECK-LABEL: test_vcge_u8 -// CHECK: vcge.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcge_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcge_u8(uint8x8_t a, uint8x8_t b) { return vcge_u8(a, b); } -// CHECK-LABEL: test_vcge_u16 -// CHECK: vcge.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vcge_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcge_u16(uint16x4_t a, uint16x4_t b) { return vcge_u16(a, b); } -// CHECK-LABEL: test_vcge_u32 -// CHECK: vcge.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcge_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcge_u32(uint32x2_t a, uint32x2_t b) { return vcge_u32(a, b); } -// CHECK-LABEL: test_vcgeq_s8 -// CHECK: vcge.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcgeq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgeq_s8(int8x16_t a, int8x16_t b) { return vcgeq_s8(a, b); } -// CHECK-LABEL: test_vcgeq_s16 -// CHECK: vcge.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcgeq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgeq_s16(int16x8_t a, int16x8_t b) { return vcgeq_s16(a, b); } -// CHECK-LABEL: test_vcgeq_s32 -// CHECK: vcge.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcgeq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgeq_s32(int32x4_t a, int32x4_t b) { return vcgeq_s32(a, b); } -// CHECK-LABEL: test_vcgeq_f32 -// CHECK: vcge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcgeq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp oge <4 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgeq_f32(float32x4_t a, float32x4_t b) { return vcgeq_f32(a, b); } -// CHECK-LABEL: test_vcgeq_u8 -// CHECK: vcge.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcgeq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgeq_u8(uint8x16_t a, uint8x16_t b) { return vcgeq_u8(a, 
b); } -// CHECK-LABEL: test_vcgeq_u16 -// CHECK: vcge.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcgeq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgeq_u16(uint16x8_t a, uint16x8_t b) { return vcgeq_u16(a, b); } -// CHECK-LABEL: test_vcgeq_u32 -// CHECK: vcge.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcgeq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgeq_u32(uint32x4_t a, uint32x4_t b) { return vcgeq_u32(a, b); } -// CHECK-LABEL: test_vcgt_s8 -// CHECK: vcgt.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcgt_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcgt_s8(int8x8_t a, int8x8_t b) { return vcgt_s8(a, b); } -// CHECK-LABEL: test_vcgt_s16 -// CHECK: vcgt.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vcgt_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcgt_s16(int16x4_t a, int16x4_t b) { return vcgt_s16(a, b); } -// CHECK-LABEL: test_vcgt_s32 -// CHECK: vcgt.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcgt_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcgt_s32(int32x2_t a, int32x2_t b) { return vcgt_s32(a, b); } -// CHECK-LABEL: test_vcgt_f32 -// CHECK: vcgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcgt_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcgt_f32(float32x2_t a, float32x2_t b) { return vcgt_f32(a, b); } -// CHECK-LABEL: test_vcgt_u8 -// CHECK: vcgt.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcgt_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcgt_u8(uint8x8_t a, uint8x8_t b) { return vcgt_u8(a, b); } -// CHECK-LABEL: test_vcgt_u16 -// CHECK: vcgt.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vcgt_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcgt_u16(uint16x4_t a, uint16x4_t b) { return vcgt_u16(a, b); } -// CHECK-LABEL: test_vcgt_u32 -// CHECK: vcgt.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcgt_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] 
uint32x2_t test_vcgt_u32(uint32x2_t a, uint32x2_t b) { return vcgt_u32(a, b); } -// CHECK-LABEL: test_vcgtq_s8 -// CHECK: vcgt.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcgtq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgtq_s8(int8x16_t a, int8x16_t b) { return vcgtq_s8(a, b); } -// CHECK-LABEL: test_vcgtq_s16 -// CHECK: vcgt.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcgtq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgtq_s16(int16x8_t a, int16x8_t b) { return vcgtq_s16(a, b); } -// CHECK-LABEL: test_vcgtq_s32 -// CHECK: vcgt.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcgtq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgtq_s32(int32x4_t a, int32x4_t b) { return vcgtq_s32(a, b); } -// CHECK-LABEL: test_vcgtq_f32 -// CHECK: vcgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcgtq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ogt <4 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgtq_f32(float32x4_t a, float32x4_t b) { return vcgtq_f32(a, b); } -// CHECK-LABEL: test_vcgtq_u8 -// CHECK: vcgt.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcgtq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcgtq_u8(uint8x16_t a, uint8x16_t b) { return vcgtq_u8(a, b); } -// CHECK-LABEL: test_vcgtq_u16 -// CHECK: vcgt.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcgtq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcgtq_u16(uint16x8_t a, uint16x8_t b) { return vcgtq_u16(a, b); } -// CHECK-LABEL: test_vcgtq_u32 -// CHECK: vcgt.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcgtq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcgtq_u32(uint32x4_t a, uint32x4_t b) { return vcgtq_u32(a, b); } -// CHECK-LABEL: test_vcle_s8 -// CHECK: vcge.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcle_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcle_s8(int8x8_t a, int8x8_t b) { return vcle_s8(a, b); } -// CHECK-LABEL: test_vcle_s16 -// CHECK: vcge.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vcle_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i16> %a, %b +// 
CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcle_s16(int16x4_t a, int16x4_t b) { return vcle_s16(a, b); } -// CHECK-LABEL: test_vcle_s32 -// CHECK: vcge.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcle_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcle_s32(int32x2_t a, int32x2_t b) { return vcle_s32(a, b); } -// CHECK-LABEL: test_vcle_f32 -// CHECK: vcge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcle_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcle_f32(float32x2_t a, float32x2_t b) { return vcle_f32(a, b); } -// CHECK-LABEL: test_vcle_u8 -// CHECK: vcge.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcle_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vcle_u8(uint8x8_t a, uint8x8_t b) { return vcle_u8(a, b); } -// CHECK-LABEL: test_vcle_u16 -// CHECK: vcge.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vcle_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vcle_u16(uint16x4_t a, uint16x4_t b) { return vcle_u16(a, b); } -// CHECK-LABEL: test_vcle_u32 -// CHECK: vcge.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcle_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vcle_u32(uint32x2_t a, uint32x2_t b) { return vcle_u32(a, b); } -// CHECK-LABEL: test_vcleq_s8 -// CHECK: vcge.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcleq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcleq_s8(int8x16_t a, int8x16_t b) { return vcleq_s8(a, b); } -// CHECK-LABEL: test_vcleq_s16 -// CHECK: vcge.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcleq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcleq_s16(int16x8_t a, int16x8_t b) { return vcleq_s16(a, b); } -// CHECK-LABEL: test_vcleq_s32 -// CHECK: vcge.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcleq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcleq_s32(int32x4_t a, int32x4_t b) { return vcleq_s32(a, b); } -// CHECK-LABEL: test_vcleq_f32 -// CHECK: vcge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcleq_f32(<4 x 
float> %a, <4 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp ole <4 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcleq_f32(float32x4_t a, float32x4_t b) { return vcleq_f32(a, b); } -// CHECK-LABEL: test_vcleq_u8 -// CHECK: vcge.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcleq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcleq_u8(uint8x16_t a, uint8x16_t b) { return vcleq_u8(a, b); } -// CHECK-LABEL: test_vcleq_u16 -// CHECK: vcge.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcleq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcleq_u16(uint16x8_t a, uint16x8_t b) { return vcleq_u16(a, b); } -// CHECK-LABEL: test_vcleq_u32 -// CHECK: vcge.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcleq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcleq_u32(uint32x4_t a, uint32x4_t b) { return vcleq_u32(a, b); } -// CHECK-LABEL: test_vcls_s8 -// CHECK: vcls.s8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 { +// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %a) #4 +// CHECK: ret <8 x i8> [[VCLS_V_I]] int8x8_t test_vcls_s8(int8x8_t a) { return vcls_s8(a); } -// CHECK-LABEL: test_vcls_s16 -// CHECK: vcls.s16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> [[VCLS_V_I]]) #4 +// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP1]] int16x4_t test_vcls_s16(int16x4_t a) { return vcls_s16(a); } -// CHECK-LABEL: test_vcls_s32 -// CHECK: vcls.s32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> [[VCLS_V_I]]) #4 +// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP1]] int32x2_t test_vcls_s32(int32x2_t a) { return vcls_s32(a); } -// CHECK-LABEL: test_vclsq_s8 -// CHECK: vcls.s8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 { +// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %a) #4 +// CHECK: ret <16 x i8> [[VCLSQ_V_I]] int8x16_t test_vclsq_s8(int8x16_t a) { return vclsq_s8(a); } -// CHECK-LABEL: test_vclsq_s16 -// CHECK: vcls.s16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = 
bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> [[VCLSQ_V_I]]) #4 +// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP1]] int16x8_t test_vclsq_s16(int16x8_t a) { return vclsq_s16(a); } -// CHECK-LABEL: test_vclsq_s32 -// CHECK: vcls.s32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> [[VCLSQ_V_I]]) #4 +// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP1]] int32x4_t test_vclsq_s32(int32x4_t a) { return vclsq_s32(a); } -// CHECK-LABEL: test_vclt_s8 -// CHECK: vcgt.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vclt_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vclt_s8(int8x8_t a, int8x8_t b) { return vclt_s8(a, b); } -// CHECK-LABEL: test_vclt_s16 -// CHECK: vcgt.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vclt_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vclt_s16(int16x4_t a, int16x4_t b) { return vclt_s16(a, b); } -// CHECK-LABEL: test_vclt_s32 -// CHECK: vcgt.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vclt_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vclt_s32(int32x2_t a, int32x2_t b) { return vclt_s32(a, b); } -// CHECK-LABEL: test_vclt_f32 -// CHECK: vcgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vclt_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vclt_f32(float32x2_t a, float32x2_t b) { return vclt_f32(a, b); } -// CHECK-LABEL: test_vclt_u8 -// CHECK: vcgt.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vclt_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK: ret <8 x i8> [[SEXT_I]] uint8x8_t test_vclt_u8(uint8x8_t a, uint8x8_t b) { return vclt_u8(a, b); } -// CHECK-LABEL: test_vclt_u16 -// CHECK: vcgt.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vclt_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[SEXT_I]] uint16x4_t test_vclt_u16(uint16x4_t a, uint16x4_t b) { return vclt_u16(a, b); } -// CHECK-LABEL: test_vclt_u32 -// CHECK: vcgt.u32 d{{[0-9]+}}, 
d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vclt_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[SEXT_I]] uint32x2_t test_vclt_u32(uint32x2_t a, uint32x2_t b) { return vclt_u32(a, b); } -// CHECK-LABEL: test_vcltq_s8 -// CHECK: vcgt.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcltq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcltq_s8(int8x16_t a, int8x16_t b) { return vcltq_s8(a, b); } -// CHECK-LABEL: test_vcltq_s16 -// CHECK: vcgt.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcltq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcltq_s16(int16x8_t a, int16x8_t b) { return vcltq_s16(a, b); } -// CHECK-LABEL: test_vcltq_s32 -// CHECK: vcgt.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcltq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcltq_s32(int32x4_t a, int32x4_t b) { return vcltq_s32(a, b); } -// CHECK-LABEL: test_vcltq_f32 -// CHECK: vcgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcltq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[CMP_I:%.*]] = fcmp olt <4 x float> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcltq_f32(float32x4_t a, float32x4_t b) { return vcltq_f32(a, b); } -// CHECK-LABEL: test_vcltq_u8 -// CHECK: vcgt.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcltq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <16 x i8> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK: ret <16 x i8> [[SEXT_I]] uint8x16_t test_vcltq_u8(uint8x16_t a, uint8x16_t b) { return vcltq_u8(a, b); } -// CHECK-LABEL: test_vcltq_u16 -// CHECK: vcgt.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcltq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i16> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[SEXT_I]] uint16x8_t test_vcltq_u16(uint16x8_t a, uint16x8_t b) { return vcltq_u16(a, b); } -// CHECK-LABEL: test_vcltq_u32 -// CHECK: vcgt.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcltq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i32> %a, %b +// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[SEXT_I]] uint32x4_t test_vcltq_u32(uint32x4_t a, uint32x4_t b) { return vcltq_u32(a, b); } -// CHECK-LABEL: test_vclz_s8 -// CHECK: vclz.i8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 { +// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4 +// CHECK: ret <8 x i8> [[VCLZ_V_I]] int8x8_t test_vclz_s8(int8x8_t a) { return vclz_s8(a); } -// CHECK-LABEL: test_vclz_s16 -// 
CHECK: vclz.i16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4 +// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP1]] int16x4_t test_vclz_s16(int16x4_t a) { return vclz_s16(a); } -// CHECK-LABEL: test_vclz_s32 -// CHECK: vclz.i32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4 +// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP1]] int32x2_t test_vclz_s32(int32x2_t a) { return vclz_s32(a); } -// CHECK-LABEL: test_vclz_u8 -// CHECK: vclz.i8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vclz_u8(<8 x i8> %a) #0 { +// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4 +// CHECK: ret <8 x i8> [[VCLZ_V_I]] uint8x8_t test_vclz_u8(uint8x8_t a) { return vclz_u8(a); } -// CHECK-LABEL: test_vclz_u16 -// CHECK: vclz.i16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vclz_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4 +// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP1]] uint16x4_t test_vclz_u16(uint16x4_t a) { return vclz_u16(a); } -// CHECK-LABEL: test_vclz_u32 -// CHECK: vclz.i32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vclz_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4 +// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP1]] uint32x2_t test_vclz_u32(uint32x2_t a) { return vclz_u32(a); } -// CHECK-LABEL: test_vclzq_s8 -// CHECK: vclz.i8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 { +// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4 +// CHECK: ret <16 x i8> [[VCLZQ_V_I]] int8x16_t test_vclzq_s8(int8x16_t a) { return vclzq_s8(a); } -// CHECK-LABEL: test_vclzq_s16 -// CHECK: vclz.i16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4 +// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = 
bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP1]] int16x8_t test_vclzq_s16(int16x8_t a) { return vclzq_s16(a); } -// CHECK-LABEL: test_vclzq_s32 -// CHECK: vclz.i32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4 +// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP1]] int32x4_t test_vclzq_s32(int32x4_t a) { return vclzq_s32(a); } -// CHECK-LABEL: test_vclzq_u8 -// CHECK: vclz.i8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(<16 x i8> %a) #0 { +// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4 +// CHECK: ret <16 x i8> [[VCLZQ_V_I]] uint8x16_t test_vclzq_u8(uint8x16_t a) { return vclzq_u8(a); } -// CHECK-LABEL: test_vclzq_u16 -// CHECK: vclz.i16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4 +// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP1]] uint16x8_t test_vclzq_u16(uint16x8_t a) { return vclzq_u16(a); } -// CHECK-LABEL: test_vclzq_u32 -// CHECK: vclz.i32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4 +// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP1]] uint32x4_t test_vclzq_u32(uint32x4_t a) { return vclzq_u32(a); } -// CHECK-LABEL: test_vcnt_u8 -// CHECK: vcnt.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(<8 x i8> %a) #0 { +// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4 +// CHECK: ret <8 x i8> [[VCNT_V_I]] uint8x8_t test_vcnt_u8(uint8x8_t a) { return vcnt_u8(a); } -// CHECK-LABEL: test_vcnt_s8 -// CHECK: vcnt.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 { +// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4 +// CHECK: ret <8 x i8> [[VCNT_V_I]] int8x8_t test_vcnt_s8(int8x8_t a) { return vcnt_s8(a); } -// CHECK-LABEL: test_vcnt_p8 -// CHECK: vcnt.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(<8 x i8> %a) #0 { +// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4 +// CHECK: ret <8 x i8> [[VCNT_V_I]] poly8x8_t test_vcnt_p8(poly8x8_t a) { return vcnt_p8(a); } -// CHECK-LABEL: test_vcntq_u8 -// CHECK: vcnt.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(<16 x i8> %a) #0 { +// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4 +// CHECK: ret <16 x i8> [[VCNTQ_V_I]] 
uint8x16_t test_vcntq_u8(uint8x16_t a) { return vcntq_u8(a); } -// CHECK-LABEL: test_vcntq_s8 -// CHECK: vcnt.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 { +// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4 +// CHECK: ret <16 x i8> [[VCNTQ_V_I]] int8x16_t test_vcntq_s8(int8x16_t a) { return vcntq_s8(a); } -// CHECK-LABEL: test_vcntq_p8 -// CHECK: vcnt.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(<16 x i8> %a) #0 { +// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4 +// CHECK: ret <16 x i8> [[VCNTQ_V_I]] poly8x16_t test_vcntq_p8(poly8x16_t a) { return vcntq_p8(a); } -// CHECK-LABEL: test_vcombine_s8 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcombine_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vcombine_s8(int8x8_t a, int8x8_t b) { return vcombine_s8(a, b); } -// CHECK-LABEL: test_vcombine_s16 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcombine_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vcombine_s16(int16x4_t a, int16x4_t b) { return vcombine_s16(a, b); } -// CHECK-LABEL: test_vcombine_s32 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vcombine_s32(int32x2_t a, int32x2_t b) { return vcombine_s32(a, b); } -// CHECK-LABEL: test_vcombine_s64 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vcombine_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] int64x2_t test_vcombine_s64(int64x1_t a, int64x1_t b) { return vcombine_s64(a, b); } -// CHECK-LABEL: test_vcombine_f16 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <8 x half> @test_vcombine_f16(<4 x half> %a, <4 x half> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> +// CHECK: ret <8 x half> [[SHUFFLE_I]] float16x8_t test_vcombine_f16(float16x4_t a, float16x4_t b) { return vcombine_f16(a, b); } -// CHECK-LABEL: test_vcombine_f32 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vcombine_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vcombine_f32(float32x2_t a, float32x2_t b) { return vcombine_f32(a, b); } -// CHECK-LABEL: test_vcombine_u8 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(<8 x i8> %a, 
<8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vcombine_u8(uint8x8_t a, uint8x8_t b) { return vcombine_u8(a, b); } -// CHECK-LABEL: test_vcombine_u16 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vcombine_u16(uint16x4_t a, uint16x4_t b) { return vcombine_u16(a, b); } -// CHECK-LABEL: test_vcombine_u32 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcombine_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vcombine_u32(uint32x2_t a, uint32x2_t b) { return vcombine_u32(a, b); } -// CHECK-LABEL: test_vcombine_u64 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vcombine_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> +// CHECK: ret <2 x i64> [[SHUFFLE_I]] uint64x2_t test_vcombine_u64(uint64x1_t a, uint64x1_t b) { return vcombine_u64(a, b); } -// CHECK-LABEL: test_vcombine_p8 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vcombine_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vcombine_p8(poly8x8_t a, poly8x8_t b) { return vcombine_p8(a, b); } -// CHECK-LABEL: test_vcombine_p16 -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} -// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vcombine_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vcombine_p16(poly16x4_t a, poly16x4_t b) { return vcombine_p16(a, b); } -// CHECK-LABEL: test_vcreate_s8 -// CHECK: vmov [[REG:d[0-9]+]], r0, r1 -// CHECK: vclz.i8 d{{[0-9]+}}, [[REG]] +// CHECK-LABEL: define <8 x i8> @test_vcreate_s8(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8> +// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4 +// CHECK: ret <8 x i8> [[VCLZ_V_I]] int8x8_t test_vcreate_s8(uint64_t a) { return vclz_s8(vcreate_s8(a)); } -// CHECK-LABEL: test_vcreate_s16 -// CHECK: vmov [[REG:d[0-9]+]], r0, r1 -// CHECK: vclz.i16 d{{[0-9]+}}, [[REG]] +// CHECK-LABEL: define <4 x i16> @test_vcreate_s16(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4 +// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vcreate_s16(uint64_t a) { return 
vclz_s16(vcreate_s16(a)); }
-// CHECK-LABEL: test_vcreate_s32
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i32 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <2 x i32> @test_vcreate_s32(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vcreate_s32(uint64_t a) { return vclz_s32(vcreate_s32(a)); }
-// CHECK-LABEL: test_vcreate_f16
+// CHECK-LABEL: define <4 x half> @test_vcreate_f16(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x half>
+// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vcreate_f16(uint64_t a) { return vcreate_f16(a); }
-// CHECK-LABEL: test_vcreate_f32
+// CHECK-LABEL: define <2 x float> @test_vcreate_f32(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x float>
+// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vcreate_f32(uint64_t a) { return vcreate_f32(a); }
-// CHECK-LABEL: test_vcreate_u8
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i8 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <8 x i8> @test_vcreate_u8(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
+// CHECK: ret <8 x i8> [[VCLZ_V_I]]
uint8x8_t test_vcreate_u8(uint64_t a) { return vclz_s8(vcreate_u8(a)); }
-// CHECK-LABEL: test_vcreate_u16
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i16 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <4 x i16> @test_vcreate_u16(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
+// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
uint16x4_t test_vcreate_u16(uint64_t a) { return vclz_s16(vcreate_u16(a)); }
-// CHECK-LABEL: test_vcreate_u32
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i32 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <2 x i32> @test_vcreate_u32(i64 %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
uint32x2_t test_vcreate_u32(uint64_t a) { return vclz_s32(vcreate_u32(a)); }
@@ -1614,10145 +2449,21932 @@
// We have two ways of lowering that: either with one 'vmov d, r, r' or
// with two 'vmov d[],r'. LLVM does the latter. We may want to be less
// strict about the matching pattern if it starts causing problems.
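// Illustrative sketch only (this helper is hypothetical and carries no
// CHECK lines): at the IR level vcreate_u64 merely reinterprets the 64-bit
// scalar, which is why the updated checks below look for a bitcast to
// <1 x i64> rather than for either vmov sequence.
//
// uint64x1_t example_vcreate_u64(uint64_t bits) {
//   // Reinterprets the i64 bit pattern as a one-element vector; with
//   // -emit-llvm this shows up as 'bitcast i64 %bits to <1 x i64>'.
//   return vcreate_u64(bits);
// }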
-// CHECK-LABEL: test_vcreate_u64 -// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0 -// CHECK: vmov.32 [[REG]][1], r1 +// CHECK-LABEL: define <1 x i64> @test_vcreate_u64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64> +// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]] +// CHECK: ret <1 x i64> [[ADD_I]] uint64x1_t test_vcreate_u64(uint64_t a) { uint64x1_t tmp = vcreate_u64(a); return vadd_u64(tmp, tmp); } -// CHECK-LABEL: test_vcreate_p8 -// CHECK: vmov [[REG:d[0-9]+]], r0, r1 -// CHECK: vcnt.8 d{{[0-9]+}}, [[REG]] +// CHECK-LABEL: define <8 x i8> @test_vcreate_p8(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8> +// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]]) #4 +// CHECK: ret <8 x i8> [[VCNT_V_I]] poly8x8_t test_vcreate_p8(uint64_t a) { return vcnt_p8(vcreate_p8(a)); } -// CHECK-LABEL: test_vcreate_p16 -// CHECK: vmov [[REG:d[0-9]+]], r0, r1 +// CHECK-LABEL: define <4 x i16> @test_vcreate_p16(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK: [[TMP4:%.*]] = xor <4 x i16> [[VBSL_I]], +// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP4]], [[VBSL2_I]] +// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK: ret <4 x i16> [[VBSL5_I]] poly16x4_t test_vcreate_p16(uint64_t a) { poly16x4_t tmp = vcreate_p16(a); return vbsl_p16(tmp, tmp, tmp); } -// CHECK-LABEL: test_vcreate_s64 -// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0 -// CHECK: vmov.32 [[REG]][1], r1 +// CHECK-LABEL: define <1 x i64> @test_vcreate_s64(i64 %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64> +// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]] +// CHECK: ret <1 x i64> [[ADD_I]] int64x1_t test_vcreate_s64(uint64_t a) { int64x1_t tmp = vcreate_s64(a); return vadd_s64(tmp, tmp); } -// CHECK-LABEL: test_vcvt_f16_f32 -// CHECK: vcvt.f16.f32 d{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) #4 +// CHECK: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half> +// CHECK: ret <4 x half> [[TMP1]] float16x4_t test_vcvt_f16_f32(float32x4_t a) { return vcvt_f16_f32(a); } -// CHECK-LABEL: test_vcvt_f32_s32 -// CHECK: vcvt.f32.s32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> +// CHECK: ret <2 x float> [[VCVT_I]] float32x2_t test_vcvt_f32_s32(int32x2_t a) { return vcvt_f32_s32(a); } -// CHECK-LABEL: test_vcvt_f32_u32 -// CHECK: vcvt.f32.u32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(<2 x 
i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK: ret <2 x float> [[VCVT_I]]
float32x2_t test_vcvt_f32_u32(uint32x2_t a) { return vcvt_f32_u32(a); }
-// CHECK-LABEL: test_vcvtq_f32_s32
-// CHECK: vcvt.f32.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK: ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_s32(int32x4_t a) { return vcvtq_f32_s32(a); }
-// CHECK-LABEL: test_vcvtq_f32_u32
-// CHECK: vcvt.f32.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK: ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_u32(uint32x4_t a) { return vcvtq_f32_u32(a); }
-// CHECK-LABEL: test_vcvt_f32_f16
-// CHECK: vcvt.f32.f16
+// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(<4 x half> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) #4
+// CHECK: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x float>
+// CHECK: ret <4 x float> [[TMP1]]
float32x4_t test_vcvt_f32_f16(float16x4_t a) { return vcvt_f32_f16(a); }
-// CHECK-LABEL: test_vcvt_n_f32_s32
-// CHECK: vcvt.f32.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK: ret <2 x float> [[VCVT_N1]]
float32x2_t test_vcvt_n_f32_s32(int32x2_t a) { return vcvt_n_f32_s32(a, 1); }
-// CHECK-LABEL: test_vcvt_n_f32_u32
-// CHECK: vcvt.f32.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK: ret <2 x float> [[VCVT_N1]]
float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) { return vcvt_n_f32_u32(a, 1); }
-// CHECK-LABEL: test_vcvtq_n_f32_s32
-// CHECK: vcvt.f32.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK: ret <4 x float> [[VCVT_N1]]
float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) { return vcvtq_n_f32_s32(a, 3); }
-// CHECK-LABEL: test_vcvtq_n_f32_u32
-// CHECK: vcvt.f32.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK: ret <4 x float> [[VCVT_N1]]
float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) { return vcvtq_n_f32_u32(a, 3); }
-// CHECK-LABEL: test_vcvt_n_s32_f32
-// CHECK: vcvt.s32.f32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK: ret <2 x i32> [[VCVT_N1]]
int32x2_t test_vcvt_n_s32_f32(float32x2_t a) { return vcvt_n_s32_f32(a, 1); }
-// CHECK-LABEL: test_vcvtq_n_s32_f32
-// CHECK: vcvt.s32.f32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK: ret <4 x i32> [[VCVT_N1]]
int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) { return vcvtq_n_s32_f32(a, 3); }
-// CHECK-LABEL: test_vcvt_n_u32_f32
-// CHECK: vcvt.u32.f32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK: ret <2 x i32> [[VCVT_N1]]
uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) { return vcvt_n_u32_f32(a, 1); }
-// CHECK-LABEL: test_vcvtq_n_u32_f32
-// CHECK: vcvt.u32.f32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK: ret <4 x i32> [[VCVT_N1]]
uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) { return vcvtq_n_u32_f32(a, 3); }
-// CHECK-LABEL: test_vcvt_s32_f32
-// CHECK: vcvt.s32.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK: [[TMP2:%.*]] = fptosi <2 x float> [[TMP1]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vcvt_s32_f32(float32x2_t a) { return vcvt_s32_f32(a); }
-// CHECK-LABEL: test_vcvtq_s32_f32
-// CHECK: vcvt.s32.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t
test_vcvtq_s32_f32(float32x4_t a) { return vcvtq_s32_f32(a); } -// CHECK-LABEL: test_vcvt_u32_f32 -// CHECK: vcvt.u32.f32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP2:%.*]] = fptoui <2 x float> [[TMP1]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vcvt_u32_f32(float32x2_t a) { return vcvt_u32_f32(a); } -// CHECK-LABEL: test_vcvtq_u32_f32 -// CHECK: vcvt.u32.f32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP2:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vcvtq_u32_f32(float32x4_t a) { return vcvtq_u32_f32(a); } -// CHECK-LABEL: test_vdup_lane_u8 -// CHECK: vdup.8 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i8> @test_vdup_lane_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE]] uint8x8_t test_vdup_lane_u8(uint8x8_t a) { return vdup_lane_u8(a, 7); } -// CHECK-LABEL: test_vdup_lane_u16 -// CHECK: vdup.16 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vdup_lane_u16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE]] uint16x4_t test_vdup_lane_u16(uint16x4_t a) { return vdup_lane_u16(a, 3); } -// CHECK-LABEL: test_vdup_lane_u32 -// CHECK: vdup.32 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vdup_lane_u32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE]] uint32x2_t test_vdup_lane_u32(uint32x2_t a) { return vdup_lane_u32(a, 1); } -// CHECK-LABEL: test_vdup_lane_s8 -// CHECK: vdup.8 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i8> @test_vdup_lane_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE]] int8x8_t test_vdup_lane_s8(int8x8_t a) { return vdup_lane_s8(a, 7); } -// CHECK-LABEL: test_vdup_lane_s16 -// CHECK: vdup.16 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vdup_lane_s16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE]] int16x4_t test_vdup_lane_s16(int16x4_t a) { return vdup_lane_s16(a, 3); } -// CHECK-LABEL: test_vdup_lane_s32 -// CHECK: vdup.32 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vdup_lane_s32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE]] int32x2_t test_vdup_lane_s32(int32x2_t a) { return vdup_lane_s32(a, 1); } -// CHECK-LABEL: test_vdup_lane_p8 -// CHECK: vdup.8 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i8> @test_vdup_lane_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE]] poly8x8_t test_vdup_lane_p8(poly8x8_t a) { return vdup_lane_p8(a, 7); } -// CHECK-LABEL: test_vdup_lane_p16 -// CHECK: vdup.16 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// 
CHECK-LABEL: define <4 x i16> @test_vdup_lane_p16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE]] poly16x4_t test_vdup_lane_p16(poly16x4_t a) { return vdup_lane_p16(a, 3); } -// CHECK-LABEL: test_vdup_lane_f32 -// CHECK: vdup.32 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x float> @test_vdup_lane_f32(<2 x float> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE]] float32x2_t test_vdup_lane_f32(float32x2_t a) { return vdup_lane_f32(a, 1); } -// CHECK-LABEL: test_vdupq_lane_u8 -// CHECK: vdup.8 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE]] uint8x16_t test_vdupq_lane_u8(uint8x8_t a) { return vdupq_lane_u8(a, 7); } -// CHECK-LABEL: test_vdupq_lane_u16 -// CHECK: vdup.16 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_u16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE]] uint16x8_t test_vdupq_lane_u16(uint16x4_t a) { return vdupq_lane_u16(a, 3); } -// CHECK-LABEL: test_vdupq_lane_u32 -// CHECK: vdup.32 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_u32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE]] uint32x4_t test_vdupq_lane_u32(uint32x2_t a) { return vdupq_lane_u32(a, 1); } -// CHECK-LABEL: test_vdupq_lane_s8 -// CHECK: vdup.8 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE]] int8x16_t test_vdupq_lane_s8(int8x8_t a) { return vdupq_lane_s8(a, 7); } -// CHECK-LABEL: test_vdupq_lane_s16 -// CHECK: vdup.16 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE]] int16x8_t test_vdupq_lane_s16(int16x4_t a) { return vdupq_lane_s16(a, 3); } -// CHECK-LABEL: test_vdupq_lane_s32 -// CHECK: vdup.32 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> +// CHECK: ret <4 x i32> [[SHUFFLE]] int32x4_t test_vdupq_lane_s32(int32x2_t a) { return vdupq_lane_s32(a, 1); } -// CHECK-LABEL: test_vdupq_lane_p8 -// CHECK: vdup.8 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> +// CHECK: ret <16 x i8> [[SHUFFLE]] poly8x16_t test_vdupq_lane_p8(poly8x8_t a) { return vdupq_lane_p8(a, 7); } -// CHECK-LABEL: test_vdupq_lane_p16 -// CHECK: vdup.16 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_p16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> +// CHECK: ret <8 x i16> [[SHUFFLE]] poly16x8_t test_vdupq_lane_p16(poly16x4_t a) { return vdupq_lane_p16(a, 3); } -// CHECK-LABEL: test_vdupq_lane_f32 -// CHECK: vdup.32 
q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x float> @test_vdupq_lane_f32(<2 x float> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <4 x i32> +// CHECK: ret <4 x float> [[SHUFFLE]] float32x4_t test_vdupq_lane_f32(float32x2_t a) { return vdupq_lane_f32(a, 1); } -// CHECK-LABEL: test_vdup_lane_s64 +// CHECK-LABEL: define <1 x i64> @test_vdup_lane_s64(<1 x i64> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE]] int64x1_t test_vdup_lane_s64(int64x1_t a) { return vdup_lane_s64(a, 0); } -// CHECK-LABEL: test_vdup_lane_u64 +// CHECK-LABEL: define <1 x i64> @test_vdup_lane_u64(<1 x i64> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE]] uint64x1_t test_vdup_lane_u64(uint64x1_t a) { return vdup_lane_u64(a, 0); } -// CHECK-LABEL: test_vdupq_lane_s64 -// CHECK: {{vmov|vdup}} +// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[SHUFFLE]] int64x2_t test_vdupq_lane_s64(int64x1_t a) { return vdupq_lane_s64(a, 0); } -// CHECK-LABEL: test_vdupq_lane_u64 -// CHECK: {{vmov|vdup}} +// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_u64(<1 x i64> %a) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[SHUFFLE]] uint64x2_t test_vdupq_lane_u64(uint64x1_t a) { return vdupq_lane_u64(a, 0); } -// CHECK-LABEL: test_vdup_n_u8 -// CHECK: vmov +// CHECK-LABEL: define <8 x i8> @test_vdup_n_u8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VECINIT7_I]] uint8x8_t test_vdup_n_u8(uint8_t a) { return vdup_n_u8(a); } -// CHECK-LABEL: test_vdup_n_u16 -// CHECK: vmov +// CHECK-LABEL: define <4 x i16> @test_vdup_n_u16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VECINIT3_I]] uint16x4_t test_vdup_n_u16(uint16_t a) { return vdup_n_u16(a); } -// CHECK-LABEL: test_vdup_n_u32 -// CHECK: mov +// CHECK-LABEL: define <2 x i32> @test_vdup_n_u32(i32 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1 +// CHECK: ret <2 x i32> [[VECINIT1_I]] uint32x2_t test_vdup_n_u32(uint32_t a) { return vdup_n_u32(a); } -// CHECK-LABEL: test_vdup_n_s8 -// CHECK: vmov +// CHECK-LABEL: define <8 x i8> @test_vdup_n_s8(i8 %a) #0 
{
+// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK: ret <8 x i8> [[VECINIT7_I]]
int8x8_t test_vdup_n_s8(int8_t a) { return vdup_n_s8(a); }
-// CHECK-LABEL: test_vdup_n_s16
-// CHECK: vmov
+// CHECK-LABEL: define <4 x i16> @test_vdup_n_s16(i16 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK: ret <4 x i16> [[VECINIT3_I]]
int16x4_t test_vdup_n_s16(int16_t a) { return vdup_n_s16(a); }
-// CHECK-LABEL: test_vdup_n_s32
-// CHECK: mov
+// CHECK-LABEL: define <2 x i32> @test_vdup_n_s32(i32 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK: ret <2 x i32> [[VECINIT1_I]]
int32x2_t test_vdup_n_s32(int32_t a) { return vdup_n_s32(a); }
-// CHECK-LABEL: test_vdup_n_p8
-// CHECK: vmov
+// CHECK-LABEL: define <8 x i8> @test_vdup_n_p8(i8 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK: ret <8 x i8> [[VECINIT7_I]]
poly8x8_t test_vdup_n_p8(poly8_t a) { return vdup_n_p8(a); }
-// CHECK-LABEL: test_vdup_n_p16
-// CHECK: vmov
+// CHECK-LABEL: define <4 x i16> @test_vdup_n_p16(i16 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK: ret <4 x i16> [[VECINIT3_I]]
poly16x4_t test_vdup_n_p16(poly16_t a) { return vdup_n_p16(a); }
-// CHECK-LABEL: test_vdup_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\]}}}
+// CHECK-LABEL: define <4 x half> @test_vdup_n_f16(half* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
+// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK: ret <4 x half> [[VECINIT3]]
float16x4_t test_vdup_n_f16(float16_t *a) { return vdup_n_f16(*a); }
-// CHECK-LABEL: test_vdup_n_f32
-// CHECK: mov
+// CHECK-LABEL: define <2 x float> @test_vdup_n_f32(float %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK: ret <2 x float> [[VECINIT1_I]]
float32x2_t test_vdup_n_f32(float32_t a) { return vdup_n_f32(a); }
-// CHECK-LABEL: test_vdupq_n_u8
-// CHECK: vmov
+// CHECK-LABEL: define <16 x i8> @test_vdupq_n_u8(i8 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK: ret <16 x i8> [[VECINIT15_I]]
uint8x16_t test_vdupq_n_u8(uint8_t a) { return vdupq_n_u8(a); }
-// CHECK-LABEL: test_vdupq_n_u16
-// CHECK: vmov
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_u16(i16 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK: ret <8 x i16> [[VECINIT7_I]]
uint16x8_t test_vdupq_n_u16(uint16_t a) { return vdupq_n_u16(a); }
-// CHECK-LABEL: test_vdupq_n_u32
-// CHECK: vmov
+// CHECK-LABEL: define <4 x i32> @test_vdupq_n_u32(i32 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
+// CHECK: ret <4 x i32>
[[VECINIT3_I]] uint32x4_t test_vdupq_n_u32(uint32_t a) { return vdupq_n_u32(a); } -// CHECK-LABEL: test_vdupq_n_s8 -// CHECK: vmov +// CHECK-LABEL: define <16 x i8> @test_vdupq_n_s8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8 +// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9 +// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10 +// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11 +// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12 +// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13 +// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14 +// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VECINIT15_I]] int8x16_t test_vdupq_n_s8(int8_t a) { return vdupq_n_s8(a); } -// CHECK-LABEL: test_vdupq_n_s16 -// CHECK: vmov +// CHECK-LABEL: define <8 x i16> @test_vdupq_n_s16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VECINIT7_I]] int16x8_t test_vdupq_n_s16(int16_t a) { return vdupq_n_s16(a); } -// CHECK-LABEL: test_vdupq_n_s32 -// CHECK: vmov +// CHECK-LABEL: define <4 x i32> @test_vdupq_n_s32(i32 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3 +// CHECK: ret <4 x i32> [[VECINIT3_I]] int32x4_t test_vdupq_n_s32(int32_t a) { return vdupq_n_s32(a); } -// CHECK-LABEL: test_vdupq_n_p8 -// CHECK: vmov +// CHECK-LABEL: define <16 x i8> @test_vdupq_n_p8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3 
+// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK: ret <16 x i8> [[VECINIT15_I]]
poly8x16_t test_vdupq_n_p8(poly8_t a) { return vdupq_n_p8(a); }
-// CHECK-LABEL: test_vdupq_n_p16
-// CHECK: vmov
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_p16(i16 %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK: ret <8 x i16> [[VECINIT7_I]]
poly16x8_t test_vdupq_n_p16(poly16_t a) { return vdupq_n_p16(a); }
-// CHECK-LABEL: test_vdupq_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\], d[0-9]+\[\]}}}
+// CHECK-LABEL: define <8 x half> @test_vdupq_n_f16(half* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
+// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
+// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
+// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
+// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
+// CHECK: ret <8 x half> [[VECINIT7]]
float16x8_t test_vdupq_n_f16(float16_t *a) { return vdupq_n_f16(*a); }
-// CHECK-LABEL: test_vdupq_n_f32
-// CHECK: vmov
+// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(float %a) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
+// CHECK: ret <4
x float> [[VECINIT3_I]] float32x4_t test_vdupq_n_f32(float32_t a) { return vdupq_n_f32(a); } -// CHECK-LABEL: test_vdup_n_s64 -// CHECK: vmov +// CHECK-LABEL: define <1 x i64> @test_vdup_n_s64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] +// CHECK: ret <1 x i64> [[ADD_I]] int64x1_t test_vdup_n_s64(int64_t a) { int64x1_t tmp = vdup_n_s64(a); return vadd_s64(tmp, tmp); } -// CHECK-LABEL: test_vdup_n_u64 -// CHECK: vmov +// CHECK-LABEL: define <1 x i64> @test_vdup_n_u64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] +// CHECK: ret <1 x i64> [[ADD_I]] uint64x1_t test_vdup_n_u64(uint64_t a) { int64x1_t tmp = vdup_n_u64(a); return vadd_s64(tmp, tmp); } -// CHECK-LABEL: test_vdupq_n_s64 -// CHECK: vmov +// CHECK-LABEL: define <2 x i64> @test_vdupq_n_s64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vdupq_n_s64(int64_t a) { int64x2_t tmp = vdupq_n_s64(a); return vaddq_s64(tmp, tmp); } -// CHECK-LABEL: test_vdupq_n_u64 -// CHECK: vmov +// CHECK-LABEL: define <2 x i64> @test_vdupq_n_u64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vdupq_n_u64(uint64_t a) { int64x2_t tmp = vdupq_n_u64(a); return vaddq_u64(tmp, tmp); } -// CHECK-LABEL: test_veor_s8 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_veor_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[XOR_I]] int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) { return veor_s8(a, b); } -// CHECK-LABEL: test_veor_s16 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_veor_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[XOR_I]] int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) { return veor_s16(a, b); } -// CHECK-LABEL: test_veor_s32 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_veor_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[XOR_I]] int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) { return veor_s32(a, b); } -// CHECK-LABEL: test_veor_s64 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_veor_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[XOR_I]] int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) { return veor_s64(a, b); } -// CHECK-LABEL: test_veor_u8 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_veor_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[XOR_I]] uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) { return veor_u8(a, b); } -// CHECK-LABEL: test_veor_u16 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} 
+// CHECK-LABEL: define <4 x i16> @test_veor_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[XOR_I]] uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) { return veor_u16(a, b); } -// CHECK-LABEL: test_veor_u32 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_veor_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[XOR_I]] uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) { return veor_u32(a, b); } -// CHECK-LABEL: test_veor_u64 -// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_veor_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[XOR_I]] uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) { return veor_u64(a, b); } -// CHECK-LABEL: test_veorq_s8 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_veorq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[XOR_I]] int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) { return veorq_s8(a, b); } -// CHECK-LABEL: test_veorq_s16 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[XOR_I]] int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) { return veorq_s16(a, b); } -// CHECK-LABEL: test_veorq_s32 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_veorq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[XOR_I]] int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) { return veorq_s32(a, b); } -// CHECK-LABEL: test_veorq_s64 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_veorq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[XOR_I]] int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) { return veorq_s64(a, b); } -// CHECK-LABEL: test_veorq_u8 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_veorq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[XOR_I]] uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) { return veorq_u8(a, b); } -// CHECK-LABEL: test_veorq_u16 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_veorq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[XOR_I]] uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) { return veorq_u16(a, b); } -// CHECK-LABEL: test_veorq_u32 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[XOR_I]] uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) { return veorq_u32(a, b); } -// CHECK-LABEL: test_veorq_u64 -// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_veorq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[XOR_I]] uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) { return veorq_u64(a, b); 
} -// CHECK-LABEL: test_vext_s8 -// CHECK: vext.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[VEXT]] int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) { return vext_s8(a, b, 7); } -// CHECK-LABEL: test_vext_u8 -// CHECK: vext.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[VEXT]] uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) { return vext_u8(a, b, 7); } -// CHECK-LABEL: test_vext_p8 -// CHECK: vext.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: ret <8 x i8> [[VEXT]] poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) { return vext_p8(a, b, 7); } -// CHECK-LABEL: test_vext_s16 -// CHECK: vext.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i16> [[VEXT]] int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) { return vext_s16(a, b, 3); } -// CHECK-LABEL: test_vext_u16 -// CHECK: vext.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i16> [[VEXT]] uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) { return vext_u16(a, b, 3); } -// CHECK-LABEL: test_vext_p16 -// CHECK: vext.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i16> [[VEXT]] poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) { return vext_p16(a, b, 3); } -// CHECK-LABEL: test_vext_s32 -// CHECK: vext.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK: ret <2 x i32> 
[[VEXT]] int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) { return vext_s32(a, b, 1); } -// CHECK-LABEL: test_vext_u32 -// CHECK: vext.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK: ret <2 x i32> [[VEXT]] uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) { return vext_u32(a, b, 1); } -// CHECK-LABEL: test_vext_s64 +// CHECK-LABEL: define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[VEXT]] int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) { return vext_s64(a, b, 0); } -// CHECK-LABEL: test_vext_u64 +// CHECK-LABEL: define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[VEXT]] uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) { return vext_u64(a, b, 0); } -// CHECK-LABEL: test_vext_f32 -// CHECK: vext.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> +// CHECK: ret <2 x float> [[VEXT]] float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) { return vext_f32(a, b, 1); } -// CHECK-LABEL: test_vextq_s8 -// CHECK: vext.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[VEXT]] int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) { return vextq_s8(a, b, 15); } -// CHECK-LABEL: test_vextq_u8 -// CHECK: vext.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: ret <16 x i8> [[VEXT]] uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) { return vextq_u8(a, b, 15); } -// CHECK-LABEL: test_vextq_p8 -// CHECK: vext.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> 
%b, <16 x i32> +// CHECK: ret <16 x i8> [[VEXT]] poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) { return vextq_p8(a, b, 15); } -// CHECK-LABEL: test_vextq_s16 -// CHECK: vext.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK: ret <8 x i16> [[VEXT]] int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) { return vextq_s16(a, b, 7); } -// CHECK-LABEL: test_vextq_u16 -// CHECK: vext.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK: ret <8 x i16> [[VEXT]] uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) { return vextq_u16(a, b, 7); } -// CHECK-LABEL: test_vextq_p16 -// CHECK: vext.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK: ret <8 x i16> [[VEXT]] poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) { return vextq_p16(a, b, 7); } -// CHECK-LABEL: test_vextq_s32 -// CHECK: vext.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i32> [[VEXT]] int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) { return vextq_s32(a, b, 3); } -// CHECK-LABEL: test_vextq_u32 -// CHECK: vext.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK: ret <4 x i32> [[VEXT]] uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) { return vextq_u32(a, b, 3); } -// CHECK-LABEL: test_vextq_s64 -// CHECK: {{vmov|vdup}} +// CHECK-LABEL: define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x 
i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> +// CHECK: ret <2 x i64> [[VEXT]] int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) { return vextq_s64(a, b, 1); } -// CHECK-LABEL: test_vextq_u64 -// CHECK: {{vmov|vdup}} +// CHECK-LABEL: define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> +// CHECK: ret <2 x i64> [[VEXT]] uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) { return vextq_u64(a, b, 1); } -// CHECK-LABEL: test_vextq_f32 -// CHECK: vext.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +// CHECK: ret <4 x float> [[VEXT]] float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) { return vextq_f32(a, b, 3); } -// CHECK-LABEL: test_vfma_f32 -// CHECK: vfma.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vfma_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4 +// CHECK: ret <2 x float> [[TMP6]] float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vfma_f32(a, b, c); } -// CHECK-LABEL: test_vfmaq_f32 -// CHECK: vfma.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4 +// CHECK: ret <4 x float> [[TMP6]] float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vfmaq_f32(a, b, c); } -// CHECK-LABEL: test_vget_high_s8 +// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> 
+// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vget_high_s8(int8x16_t a) { return vget_high_s8(a); } -// CHECK-LABEL: test_vget_high_s16 +// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vget_high_s16(int16x8_t a) { return vget_high_s16(a); } -// CHECK-LABEL: test_vget_high_s32 +// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vget_high_s32(int32x4_t a) { return vget_high_s32(a); } -// CHECK-LABEL: test_vget_high_s64 +// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> +// CHECK: ret <1 x i64> [[SHUFFLE_I]] int64x1_t test_vget_high_s64(int64x2_t a) { return vget_high_s64(a); } -// CHECK-LABEL: test_vget_high_f16 +// CHECK-LABEL: define <4 x half> @test_vget_high_f16(<8 x half> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> +// CHECK: ret <4 x half> [[SHUFFLE_I]] float16x4_t test_vget_high_f16(float16x8_t a) { return vget_high_f16(a); } -// CHECK-LABEL: test_vget_high_f32 +// CHECK-LABEL: define <2 x float> @test_vget_high_f32(<4 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vget_high_f32(float32x4_t a) { return vget_high_f32(a); } -// CHECK-LABEL: test_vget_high_u8 +// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vget_high_u8(uint8x16_t a) { return vget_high_u8(a); } -// CHECK-LABEL: test_vget_high_u16 +// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vget_high_u16(uint16x8_t a) { return vget_high_u16(a); } -// CHECK-LABEL: test_vget_high_u32 +// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vget_high_u32(uint32x4_t a) { return vget_high_u32(a); } -// CHECK-LABEL: test_vget_high_u64 +// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> +// CHECK: ret <1 x i64> [[SHUFFLE_I]] uint64x1_t test_vget_high_u64(uint64x2_t a) { return vget_high_u64(a); } -// CHECK-LABEL: test_vget_high_p8 +// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vget_high_p8(poly8x16_t a) { return vget_high_p8(a); } -// CHECK-LABEL: test_vget_high_p16 +// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vget_high_p16(poly16x8_t a) { return vget_high_p16(a); } -// CHECK-LABEL: test_vget_lane_u8 -// CHECK: vmov +// CHECK-LABEL: define i8 @test_vget_lane_u8(<8 x i8> 
%a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] uint8_t test_vget_lane_u8(uint8x8_t a) { return vget_lane_u8(a, 7); } -// CHECK-LABEL: test_vget_lane_u16 -// CHECK: vmov +// CHECK-LABEL: define i16 @test_vget_lane_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] uint16_t test_vget_lane_u16(uint16x4_t a) { return vget_lane_u16(a, 3); } -// CHECK-LABEL: test_vget_lane_u32 -// CHECK: mov +// CHECK-LABEL: define i32 @test_vget_lane_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: ret i32 [[VGET_LANE]] uint32_t test_vget_lane_u32(uint32x2_t a) { return vget_lane_u32(a, 1); } -// CHECK-LABEL: test_vget_lane_s8 -// CHECK: vmov +// CHECK-LABEL: define i8 @test_vget_lane_s8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] int8_t test_vget_lane_s8(int8x8_t a) { return vget_lane_s8(a, 7); } -// CHECK-LABEL: test_vget_lane_s16 -// CHECK: vmov +// CHECK-LABEL: define i16 @test_vget_lane_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] int16_t test_vget_lane_s16(int16x4_t a) { return vget_lane_s16(a, 3); } -// CHECK-LABEL: test_vget_lane_s32 -// CHECK: mov +// CHECK-LABEL: define i32 @test_vget_lane_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK: ret i32 [[VGET_LANE]] int32_t test_vget_lane_s32(int32x2_t a) { return vget_lane_s32(a, 1); } -// CHECK-LABEL: test_vget_lane_p8 -// CHECK: vmov +// CHECK-LABEL: define i8 @test_vget_lane_p8(<8 x i8> %a) #0 { +// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7 +// CHECK: ret i8 [[VGET_LANE]] poly8_t test_vget_lane_p8(poly8x8_t a) { return vget_lane_p8(a, 7); } -// CHECK-LABEL: test_vget_lane_p16 -// CHECK: vmov +// CHECK-LABEL: define i16 @test_vget_lane_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK: ret i16 [[VGET_LANE]] poly16_t test_vget_lane_p16(poly16x4_t a) { return vget_lane_p16(a, 3); } -// CHECK-LABEL: test_vget_lane_f32 -// CHECK: vmov +// CHECK-LABEL: define float @test_vget_lane_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +// CHECK: ret float [[VGET_LANE]] float32_t test_vget_lane_f32(float32x2_t a) { return vget_lane_f32(a, 1); } -// CHECK-LABEL: test_vget_lane_f16 -// CHECK: vmov +// CHECK-LABEL: define float @test_vget_lane_f16(<4 x half> %a) #0 { +// CHECK: [[__REINT_242:%.*]] = alloca <4 x half>, align 8 +// CHECK: [[__REINT1_242:%.*]] = alloca i16, align 2 +// CHECK: store <4 x half> %a, <4 x 
half>* [[__REINT_242]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_242]] to <4 x i16>* +// CHECK: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8 +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1 +// CHECK: store i16 [[VGET_LANE]], i16* [[__REINT1_242]], align 2 +// CHECK: [[TMP4:%.*]] = bitcast i16* [[__REINT1_242]] to half* +// CHECK: [[TMP5:%.*]] = load half, half* [[TMP4]], align 2 +// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float +// CHECK: ret float [[CONV]] float32_t test_vget_lane_f16(float16x4_t a) { return vget_lane_f16(a, 1); } -// CHECK-LABEL: test_vgetq_lane_u8 -// CHECK: vmov +// CHECK-LABEL: define i8 @test_vgetq_lane_u8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] uint8_t test_vgetq_lane_u8(uint8x16_t a) { return vgetq_lane_u8(a, 15); } -// CHECK-LABEL: test_vgetq_lane_u16 -// CHECK: vmov +// CHECK-LABEL: define i16 @test_vgetq_lane_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] uint16_t test_vgetq_lane_u16(uint16x8_t a) { return vgetq_lane_u16(a, 7); } -// CHECK-LABEL: test_vgetq_lane_u32 -// CHECK: vmov +// CHECK-LABEL: define i32 @test_vgetq_lane_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: ret i32 [[VGETQ_LANE]] uint32_t test_vgetq_lane_u32(uint32x4_t a) { return vgetq_lane_u32(a, 3); } -// CHECK-LABEL: test_vgetq_lane_s8 -// CHECK: vmov +// CHECK-LABEL: define i8 @test_vgetq_lane_s8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] int8_t test_vgetq_lane_s8(int8x16_t a) { return vgetq_lane_s8(a, 15); } -// CHECK-LABEL: test_vgetq_lane_s16 -// CHECK: vmov +// CHECK-LABEL: define i16 @test_vgetq_lane_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] int16_t test_vgetq_lane_s16(int16x8_t a) { return vgetq_lane_s16(a, 7); } -// CHECK-LABEL: test_vgetq_lane_s32 -// CHECK: vmov +// CHECK-LABEL: define i32 @test_vgetq_lane_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK: ret i32 [[VGETQ_LANE]] int32_t test_vgetq_lane_s32(int32x4_t a) { return vgetq_lane_s32(a, 3); } -// CHECK-LABEL: test_vgetq_lane_p8 -// CHECK: vmov +// CHECK-LABEL: define i8 @test_vgetq_lane_p8(<16 x i8> %a) #0 { +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15 +// CHECK: ret i8 [[VGETQ_LANE]] poly8_t test_vgetq_lane_p8(poly8x16_t a) { return vgetq_lane_p8(a, 15); } -// CHECK-LABEL: test_vgetq_lane_p16 -// CHECK: vmov +// CHECK-LABEL: define i16 @test_vgetq_lane_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> 
[[TMP0]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK: ret i16 [[VGETQ_LANE]] poly16_t test_vgetq_lane_p16(poly16x8_t a) { return vgetq_lane_p16(a, 7); } -// CHECK-LABEL: test_vgetq_lane_f32 -// CHECK: vmov +// CHECK-LABEL: define float @test_vgetq_lane_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +// CHECK: ret float [[VGETQ_LANE]] float32_t test_vgetq_lane_f32(float32x4_t a) { return vgetq_lane_f32(a, 3); } -// CHECK-LABEL: test_vgetq_lane_f16 -// CHECK: vmov +// CHECK-LABEL: define float @test_vgetq_lane_f16(<8 x half> %a) #0 { +// CHECK: [[__REINT_244:%.*]] = alloca <8 x half>, align 16 +// CHECK: [[__REINT1_244:%.*]] = alloca i16, align 2 +// CHECK: store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>* +// CHECK: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +// CHECK: store i16 [[VGETQ_LANE]], i16* [[__REINT1_244]], align 2 +// CHECK: [[TMP4:%.*]] = bitcast i16* [[__REINT1_244]] to half* +// CHECK: [[TMP5:%.*]] = load half, half* [[TMP4]], align 2 +// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float +// CHECK: ret float [[CONV]] float32_t test_vgetq_lane_f16(float16x8_t a) { return vgetq_lane_f16(a, 3); } -// CHECK-LABEL: test_vget_lane_s64 // The optimizer is able to remove all moves now. +// CHECK-LABEL: define i64 @test_vget_lane_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: ret i64 [[VGET_LANE]] int64_t test_vget_lane_s64(int64x1_t a) { return vget_lane_s64(a, 0); } -// CHECK-LABEL: test_vget_lane_u64 // The optimizer is able to remove all moves now. 
+// CHECK-LABEL: define i64 @test_vget_lane_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK: ret i64 [[VGET_LANE]] uint64_t test_vget_lane_u64(uint64x1_t a) { return vget_lane_u64(a, 0); } -// CHECK-LABEL: test_vgetq_lane_s64 -// CHECK: vmov +// CHECK-LABEL: define i64 @test_vgetq_lane_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: ret i64 [[VGETQ_LANE]] int64_t test_vgetq_lane_s64(int64x2_t a) { return vgetq_lane_s64(a, 1); } -// CHECK-LABEL: test_vgetq_lane_u64 -// CHECK: vmov +// CHECK-LABEL: define i64 @test_vgetq_lane_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK: ret i64 [[VGETQ_LANE]] uint64_t test_vgetq_lane_u64(uint64x2_t a) { return vgetq_lane_u64(a, 1); } -// CHECK-LABEL: test_vget_low_s8 +// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vget_low_s8(int8x16_t a) { return vget_low_s8(a); } -// CHECK-LABEL: test_vget_low_s16 +// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vget_low_s16(int16x8_t a) { return vget_low_s16(a); } -// CHECK-LABEL: test_vget_low_s32 +// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vget_low_s32(int32x4_t a) { return vget_low_s32(a); } -// CHECK-LABEL: test_vget_low_s64 +// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE_I]] int64x1_t test_vget_low_s64(int64x2_t a) { return vget_low_s64(a); } -// CHECK-LABEL: test_vget_low_f16 +// CHECK-LABEL: define <4 x half> @test_vget_low_f16(<8 x half> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> +// CHECK: ret <4 x half> [[SHUFFLE_I]] float16x4_t test_vget_low_f16(float16x8_t a) { return vget_low_f16(a); } -// CHECK-LABEL: test_vget_low_f32 +// CHECK-LABEL: define <2 x float> @test_vget_low_f32(<4 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vget_low_f32(float32x4_t a) { return vget_low_f32(a); } -// CHECK-LABEL: test_vget_low_u8 +// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vget_low_u8(uint8x16_t a) { return vget_low_u8(a); } -// CHECK-LABEL: test_vget_low_u16 +// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x 
i16> [[SHUFFLE_I]] uint16x4_t test_vget_low_u16(uint16x8_t a) { return vget_low_u16(a); } -// CHECK-LABEL: test_vget_low_u32 +// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vget_low_u32(uint32x4_t a) { return vget_low_u32(a); } -// CHECK-LABEL: test_vget_low_u64 +// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(<2 x i64> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[SHUFFLE_I]] uint64x1_t test_vget_low_u64(uint64x2_t a) { return vget_low_u64(a); } -// CHECK-LABEL: test_vget_low_p8 +// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vget_low_p8(poly8x16_t a) { return vget_low_p8(a); } -// CHECK-LABEL: test_vget_low_p16 +// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vget_low_p16(poly16x8_t a) { return vget_low_p16(a); } -// CHECK-LABEL: test_vhadd_s8 -// CHECK: vhadd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VHADD_V_I]] int8x8_t test_vhadd_s8(int8x8_t a, int8x8_t b) { return vhadd_s8(a, b); } -// CHECK-LABEL: test_vhadd_s16 -// CHECK: vhadd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) { return vhadd_s16(a, b); } -// CHECK-LABEL: test_vhadd_s32 -// CHECK: vhadd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vhadd_s32(int32x2_t a, int32x2_t b) { return vhadd_s32(a, b); } -// CHECK-LABEL: test_vhadd_u8 -// CHECK: vhadd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VHADD_V_I]] uint8x8_t test_vhadd_u8(uint8x8_t a, uint8x8_t b) { return vhadd_u8(a, b); } -// CHECK-LABEL: test_vhadd_u16 -// CHECK: vhadd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) { return vhadd_u16(a, b); } -// CHECK-LABEL: test_vhadd_u32 -// CHECK: vhadd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4 +// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vhadd_u32(uint32x2_t a, uint32x2_t b) { return vhadd_u32(a, b); } -// CHECK-LABEL: test_vhaddq_s8 -// CHECK: vhadd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VHADDQ_V_I]] int8x16_t test_vhaddq_s8(int8x16_t a, int8x16_t b) { return vhaddq_s8(a, b); } -// CHECK-LABEL: test_vhaddq_s16 -// CHECK: vhadd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) { return vhaddq_s16(a, b); } -// CHECK-LABEL: test_vhaddq_s32 -// CHECK: vhadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VHADDQ_V2_I:%.*]] = 
call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vhaddq_s32(int32x4_t a, int32x4_t b) { return vhaddq_s32(a, b); } -// CHECK-LABEL: test_vhaddq_u8 -// CHECK: vhadd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VHADDQ_V_I]] uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) { return vhaddq_u8(a, b); } -// CHECK-LABEL: test_vhaddq_u16 -// CHECK: vhadd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) { return vhaddq_u16(a, b); } -// CHECK-LABEL: test_vhaddq_u32 -// CHECK: vhadd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4 +// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) { return vhaddq_u32(a, b); } -// CHECK-LABEL: test_vhsub_s8 -// CHECK: vhsub.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vhsub_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VHSUB_V_I]] int8x8_t test_vhsub_s8(int8x8_t a, int8x8_t b) { return vhsub_s8(a, b); } -// CHECK-LABEL: test_vhsub_s16 -// CHECK: vhsub.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vhsub_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4 +// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to 
<4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) { return vhsub_s16(a, b); } -// CHECK-LABEL: test_vhsub_s32 -// CHECK: vhsub.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vhsub_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4 +// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vhsub_s32(int32x2_t a, int32x2_t b) { return vhsub_s32(a, b); } -// CHECK-LABEL: test_vhsub_u8 -// CHECK: vhsub.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vhsub_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VHSUB_V_I]] uint8x8_t test_vhsub_u8(uint8x8_t a, uint8x8_t b) { return vhsub_u8(a, b); } -// CHECK-LABEL: test_vhsub_u16 -// CHECK: vhsub.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vhsub_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4 +// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) { return vhsub_u16(a, b); } -// CHECK-LABEL: test_vhsub_u32 -// CHECK: vhsub.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vhsub_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4 +// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vhsub_u32(uint32x2_t a, uint32x2_t b) { return vhsub_u32(a, b); } -// CHECK-LABEL: test_vhsubq_s8 -// CHECK: vhsub.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VHSUBQ_V_I]] int8x16_t test_vhsubq_s8(int8x16_t a, int8x16_t b) { return vhsubq_s8(a, b); } -// CHECK-LABEL: test_vhsubq_s16 -// CHECK: vhsub.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16(<8 x i16> %a, <8 
x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4 +// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) { return vhsubq_s16(a, b); } -// CHECK-LABEL: test_vhsubq_s32 -// CHECK: vhsub.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4 +// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vhsubq_s32(int32x4_t a, int32x4_t b) { return vhsubq_s32(a, b); } -// CHECK-LABEL: test_vhsubq_u8 -// CHECK: vhsub.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VHSUBQ_V_I]] uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) { return vhsubq_u8(a, b); } -// CHECK-LABEL: test_vhsubq_u16 -// CHECK: vhsub.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4 +// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) { return vhsubq_u16(a, b); } -// CHECK-LABEL: test_vhsubq_u32 -// CHECK: vhsub.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4 +// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t 
test_vhsubq_u32(uint32x4_t a, uint32x4_t b) { return vhsubq_u32(a, b); } -// CHECK-LABEL: test_vld1q_u8 -// CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_u8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]] +// CHECK: ret <16 x i8> [[TMP1]] uint8x16_t test_vld1q_u8(uint8_t const * a) { return vld1q_u8(a); } -// CHECK-LABEL: test_vld1q_u16 -// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i16> @test_vld1q_u16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]] +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vld1q_u16(uint16_t const * a) { return vld1q_u16(a); } -// CHECK-LABEL: test_vld1q_u32 -// CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <4 x i32> @test_vld1q_u32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]] +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vld1q_u32(uint32_t const * a) { return vld1q_u32(a); } -// CHECK-LABEL: test_vld1q_u64 -// CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define <2 x i64> @test_vld1q_u64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]] +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vld1q_u64(uint64_t const * a) { return vld1q_u64(a); } -// CHECK-LABEL: test_vld1q_s8 -// CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_s8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]] +// CHECK: ret <16 x i8> [[TMP1]] int8x16_t test_vld1q_s8(int8_t const * a) { return vld1q_s8(a); } -// CHECK-LABEL: test_vld1q_s16 -// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i16> @test_vld1q_s16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]] +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vld1q_s16(int16_t const * a) { return vld1q_s16(a); } -// CHECK-LABEL: test_vld1q_s32 -// CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <4 x i32> @test_vld1q_s32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]] +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vld1q_s32(int32_t const * a) { return vld1q_s32(a); } -// CHECK-LABEL: test_vld1q_s64 -// CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define <2 x i64> @test_vld1q_s64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]] +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vld1q_s64(int64_t const * a) { return vld1q_s64(a); } -// CHECK-LABEL: test_vld1q_f16 -// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x 
half> @test_vld1q_f16(half* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]] +// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <8 x half> +// CHECK: ret <8 x half> [[TMP3]] float16x8_t test_vld1q_f16(float16_t const * a) { return vld1q_f16(a); } -// CHECK-LABEL: test_vld1q_f32 -// CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <4 x float> @test_vld1q_f32(float* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* +// CHECK: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]] +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vld1q_f32(float32_t const * a) { return vld1q_f32(a); } -// CHECK-LABEL: test_vld1q_p8 -// CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_p8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]] +// CHECK: ret <16 x i8> [[TMP1]] poly8x16_t test_vld1q_p8(poly8_t const * a) { return vld1q_p8(a); } -// CHECK-LABEL: test_vld1q_p16 -// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i16> @test_vld1q_p16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]] +// CHECK: ret <8 x i16> [[TMP2]] poly16x8_t test_vld1q_p16(poly16_t const * a) { return vld1q_p16(a); } -// CHECK-LABEL: test_vld1_u8 -// CHECK: vld1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i8> @test_vld1_u8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]] +// CHECK: ret <8 x i8> [[TMP1]] uint8x8_t test_vld1_u8(uint8_t const * a) { return vld1_u8(a); } -// CHECK-LABEL: test_vld1_u16 -// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <4 x i16> @test_vld1_u16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]] +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vld1_u16(uint16_t const * a) { return vld1_u16(a); } -// CHECK-LABEL: test_vld1_u32 -// CHECK: vld1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <2 x i32> @test_vld1_u32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]] +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vld1_u32(uint32_t const * a) { return vld1_u32(a); } -// CHECK-LABEL: test_vld1_u64 -// CHECK: vld1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define <1 x i64> @test_vld1_u64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]] +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vld1_u64(uint64_t const * a) { return vld1_u64(a); } -// CHECK-LABEL: test_vld1_s8 -// CHECK: vld1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i8> @test_vld1_s8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]] +// CHECK: ret <8 x i8> [[TMP1]] int8x8_t 
test_vld1_s8(int8_t const * a) { return vld1_s8(a); } -// CHECK-LABEL: test_vld1_s16 -// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <4 x i16> @test_vld1_s16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]] +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vld1_s16(int16_t const * a) { return vld1_s16(a); } -// CHECK-LABEL: test_vld1_s32 -// CHECK: vld1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <2 x i32> @test_vld1_s32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]] +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vld1_s32(int32_t const * a) { return vld1_s32(a); } -// CHECK-LABEL: test_vld1_s64 -// CHECK: vld1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define <1 x i64> @test_vld1_s64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]] +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vld1_s64(int64_t const * a) { return vld1_s64(a); } -// CHECK-LABEL: test_vld1_f16 -// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <4 x half> @test_vld1_f16(half* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]] +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half> +// CHECK: ret <4 x half> [[TMP3]] float16x4_t test_vld1_f16(float16_t const * a) { return vld1_f16(a); } -// CHECK-LABEL: test_vld1_f32 -// CHECK: vld1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <2 x float> @test_vld1_f32(float* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]] +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vld1_f32(float32_t const * a) { return vld1_f32(a); } -// CHECK-LABEL: test_vld1_p8 -// CHECK: vld1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i8> @test_vld1_p8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]] +// CHECK: ret <8 x i8> [[TMP1]] poly8x8_t test_vld1_p8(poly8_t const * a) { return vld1_p8(a); } -// CHECK-LABEL: test_vld1_p16 -// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define <4 x i16> @test_vld1_p16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]] +// CHECK: ret <4 x i16> [[TMP2]] poly16x4_t test_vld1_p16(poly16_t const * a) { return vld1_p16(a); } -// CHECK-LABEL: test_vld1q_dup_u8 -// CHECK: vld1.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer +// CHECK: ret <16 x i8> [[LANE]] uint8x16_t test_vld1q_dup_u8(uint8_t const * a) { return vld1q_dup_u8(a); } -// CHECK-LABEL: test_vld1q_dup_u16 -// CHECK: vld1.16 
{d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer +// CHECK: ret <8 x i16> [[LANE]] uint16x8_t test_vld1q_dup_u16(uint16_t const * a) { return vld1q_dup_u16(a); } -// CHECK-LABEL: test_vld1q_dup_u32 -// CHECK: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: ret <4 x i32> [[LANE]] uint32x4_t test_vld1q_dup_u32(uint32_t const * a) { return vld1q_dup_u32(a); } -// CHECK-LABEL: test_vld1q_dup_u64 -// CHECK: {{ldr|vldr|vmov}} +// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[LANE]] uint64x2_t test_vld1q_dup_u64(uint64_t const * a) { return vld1q_dup_u64(a); } -// CHECK-LABEL: test_vld1q_dup_s8 -// CHECK: vld1.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer +// CHECK: ret <16 x i8> [[LANE]] int8x16_t test_vld1q_dup_s8(int8_t const * a) { return vld1q_dup_s8(a); } -// CHECK-LABEL: test_vld1q_dup_s16 -// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer +// CHECK: ret <8 x i16> [[LANE]] int16x8_t test_vld1q_dup_s16(int16_t const * a) { return vld1q_dup_s16(a); } -// CHECK-LABEL: test_vld1q_dup_s32 -// CHECK: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: ret <4 x i32> [[LANE]] int32x4_t test_vld1q_dup_s32(int32_t const * a) { return vld1q_dup_s32(a); } -// CHECK-LABEL: test_vld1q_dup_s64 -// 
CHECK: {{ldr|vldr|vmov}} +// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[LANE]] int64x2_t test_vld1q_dup_s64(int64_t const * a) { return vld1q_dup_s64(a); } -// CHECK-LABEL: test_vld1q_dup_f16 -// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(half* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half> +// CHECK: ret <8 x half> [[TMP4]] float16x8_t test_vld1q_dup_f16(float16_t const * a) { return vld1q_dup_f16(a); } -// CHECK-LABEL: test_vld1q_dup_f32 -// CHECK: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32(float* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float* +// CHECK: [[TMP2:%.*]] = load float, float* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: ret <4 x float> [[LANE]] float32x4_t test_vld1q_dup_f32(float32_t const * a) { return vld1q_dup_f32(a); } -// CHECK-LABEL: test_vld1q_dup_p8 -// CHECK: vld1.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer +// CHECK: ret <16 x i8> [[LANE]] poly8x16_t test_vld1q_dup_p8(poly8_t const * a) { return vld1q_dup_p8(a); } -// CHECK-LABEL: test_vld1q_dup_p16 -// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer +// CHECK: ret <8 x i16> [[LANE]] poly16x8_t test_vld1q_dup_p16(poly16_t const * a) { return vld1q_dup_p16(a); } -// CHECK-LABEL: test_vld1_dup_u8 -// CHECK: vld1.8 {d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer +// CHECK: ret <8 x i8> [[LANE]] uint8x8_t test_vld1_dup_u8(uint8_t const * a) { return vld1_dup_u8(a); } -// CHECK-LABEL: test_vld1_dup_u16 -// CHECK: vld1.16 {d{{[0-9]+}}[]}, 
[r{{[0-9]+}}:16] +// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: ret <4 x i16> [[LANE]] uint16x4_t test_vld1_dup_u16(uint16_t const * a) { return vld1_dup_u16(a); } -// CHECK-LABEL: test_vld1_dup_u32 -// CHECK: vld1.32 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i32> [[LANE]] uint32x2_t test_vld1_dup_u32(uint32_t const * a) { return vld1_dup_u32(a); } -// CHECK-LABEL: test_vld1_dup_u64 -// CHECK: {{ldr|vldr|vmov}} +// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[LANE]] uint64x1_t test_vld1_dup_u64(uint64_t const * a) { return vld1_dup_u64(a); } -// CHECK-LABEL: test_vld1_dup_s8 -// CHECK: vld1.8 {d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8(i8* %a) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer +// CHECK: ret <8 x i8> [[LANE]] int8x8_t test_vld1_dup_s8(int8_t const * a) { return vld1_dup_s8(a); } -// CHECK-LABEL: test_vld1_dup_s16 -// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16(i16* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: ret <4 x i16> [[LANE]] int16x4_t test_vld1_dup_s16(int16_t const * a) { return vld1_dup_s16(a); } -// CHECK-LABEL: test_vld1_dup_s32 -// CHECK: vld1.32 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32(i32* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i32> [[LANE]] int32x2_t test_vld1_dup_s32(int32_t const * a) { return vld1_dup_s32(a); } -// CHECK-LABEL: test_vld1_dup_s64 -// CHECK: {{ldr|vldr|vmov}} +// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64(i64* %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast 
i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK: ret <1 x i64> [[LANE]]
int64x1_t test_vld1_dup_s64(int64_t const * a) {
  return vld1_dup_s64(a);
}
-// CHECK-LABEL: test_vld1_dup_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(half* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
+// CHECK: ret <4 x half> [[TMP4]]
float16x4_t test_vld1_dup_f16(float16_t const * a) {
  return vld1_dup_f16(a);
}
-// CHECK-LABEL: test_vld1_dup_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32(float* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK: [[TMP2:%.*]] = load float, float* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK: ret <2 x float> [[LANE]]
float32x2_t test_vld1_dup_f32(float32_t const * a) {
  return vld1_dup_f32(a);
}
-// CHECK-LABEL: test_vld1_dup_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8(i8* %a) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK: ret <8 x i8> [[LANE]]
poly8x8_t test_vld1_dup_p8(poly8_t const * a) {
  return vld1_dup_p8(a);
}
-// CHECK-LABEL: test_vld1_dup_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16(i16* %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: ret <4 x i16> [[LANE]]
poly16x4_t test_vld1_dup_p16(poly16_t const * a) {
  return vld1_dup_p16(a);
}
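+// NOTE: The vld1*_lane tests that follow are checked as a scalar load plus an
+// insertelement at the requested lane index (with bitcasts through the byte
+// vector types for non-i8 elements). A usage sketch, for illustration only
+// (`patch_lane` is a hypothetical helper, not part of this test):
+//   uint8x16_t patch_lane(const uint8_t *p, uint8x16_t v) {
+//     return vld1q_lane_u8(p, v, 15);  /* replace lane 15 with *p */
+//   }
-// CHECK-LABEL: test_vld1q_lane_u8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK: ret <16 x i8> [[VLD1_LANE]]
uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) {
  return vld1q_lane_u8(a, b, 15);
}
-// CHECK-LABEL: test_vld1q_lane_u16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast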
<16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: ret <8 x i16> [[VLD1_LANE]] uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) { return vld1q_lane_u16(a, b, 7); } -// CHECK-LABEL: test_vld1q_lane_u32 -// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3 +// CHECK: ret <4 x i32> [[VLD1_LANE]] uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) { return vld1q_lane_u32(a, b, 3); } -// CHECK-LABEL: test_vld1q_lane_u64 -// CHECK: {{ldr|vldr|vmov}} +// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1 +// CHECK: ret <2 x i64> [[VLD1_LANE]] uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) { return vld1q_lane_u64(a, b, 1); } -// CHECK-LABEL: test_vld1q_lane_s8 -// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 +// CHECK: ret <16 x i8> [[VLD1_LANE]] int8x16_t test_vld1q_lane_s8(int8_t const * a, int8x16_t b) { return vld1q_lane_s8(a, b, 15); } -// CHECK-LABEL: test_vld1q_lane_s16 -// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: ret <8 x i16> [[VLD1_LANE]] int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) { return vld1q_lane_s16(a, b, 7); } -// CHECK-LABEL: test_vld1q_lane_s32 -// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3 +// CHECK: ret <4 x i32> [[VLD1_LANE]] int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) { return vld1q_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vld1q_lane_s64 -// CHECK: 
{{ldr|vldr|vmov}} +// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1 +// CHECK: ret <2 x i64> [[VLD1_LANE]] int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) { return vld1q_lane_s64(a, b, 1); } -// CHECK-LABEL: test_vld1q_lane_f16 -// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half> +// CHECK: ret <8 x half> [[TMP5]] float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) { return vld1q_lane_f16(a, b, 7); } -// CHECK-LABEL: test_vld1q_lane_f32 -// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float* +// CHECK: [[TMP4:%.*]] = load float, float* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3 +// CHECK: ret <4 x float> [[VLD1_LANE]] float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) { return vld1q_lane_f32(a, b, 3); } -// CHECK-LABEL: test_vld1q_lane_p8 -// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 +// CHECK: ret <16 x i8> [[VLD1_LANE]] poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) { return vld1q_lane_p8(a, b, 15); } -// CHECK-LABEL: test_vld1q_lane_p16 -// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: ret <8 x i16> [[VLD1_LANE]] poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) { return vld1q_lane_p16(a, b, 7); } -// CHECK-LABEL: test_vld1_lane_u8 -// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 +// CHECK: ret <8 x i8> [[VLD1_LANE]] uint8x8_t 
test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) { return vld1_lane_u8(a, b, 7); } -// CHECK-LABEL: test_vld1_lane_u16 -// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 +// CHECK: ret <4 x i16> [[VLD1_LANE]] uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) { return vld1_lane_u16(a, b, 3); } -// CHECK-LABEL: test_vld1_lane_u32 -// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1 +// CHECK: ret <2 x i32> [[VLD1_LANE]] uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) { return vld1_lane_u32(a, b, 1); } -// CHECK-LABEL: test_vld1_lane_u64 -// CHECK: {{ldr|vldr|vmov}} +// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0 +// CHECK: ret <1 x i64> [[VLD1_LANE]] uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) { return vld1_lane_u64(a, b, 0); } -// CHECK-LABEL: test_vld1_lane_s8 -// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = load i8, i8* %a +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 +// CHECK: ret <8 x i8> [[VLD1_LANE]] int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) { return vld1_lane_s8(a, b, 7); } -// CHECK-LABEL: test_vld1_lane_s16 -// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 +// CHECK: ret <4 x i16> [[VLD1_LANE]] int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) { return vld1_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vld1_lane_s32 -// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// 
CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK: ret <2 x i32> [[VLD1_LANE]]
int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) {
  return vld1_lane_s32(a, b, 1);
}
-// CHECK-LABEL: test_vld1_lane_s64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK: ret <1 x i64> [[VLD1_LANE]]
int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
  return vld1_lane_s64(a, b, 0);
}
-// CHECK-LABEL: test_vld1_lane_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half>
+// CHECK: ret <4 x half> [[TMP5]]
float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
  return vld1_lane_f16(a, b, 3);
}
-// CHECK-LABEL: test_vld1_lane_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK: [[TMP4:%.*]] = load float, float* [[TMP3]]
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
+// CHECK: ret <2 x float> [[VLD1_LANE]]
float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) {
  return vld1_lane_f32(a, b, 1);
}
-// CHECK-LABEL: test_vld1_lane_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = load i8, i8* %a
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK: ret <8 x i8> [[VLD1_LANE]]
poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) {
  return vld1_lane_p8(a, b, 7);
}
-// CHECK-LABEL: test_vld1_lane_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK: ret <4 x i16> [[VLD1_LANE]]
poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
  return vld1_lane_p16(a, b, 3);
}
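+// NOTE: The vld2* tests that follow no longer match a vld2 instruction; with
+// -emit-llvm they check a call to the two-result @llvm.aarch64.neon.ld2.*
+// intrinsic, the store of the { vector, vector } aggregate into a temporary,
+// and the memcpy into the returned struct. A usage sketch, for illustration
+// only (`sum_pairs` is a hypothetical helper, not part of this test):
+//   uint8x8_t sum_pairs(const uint8_t *p) {
+//     uint8x8x2_t v = vld2_u8(p);          /* de-interleaving load */
+//     return vadd_u8(v.val[0], v.val[1]);  /* even lanes + odd lanes */
+//   }
-// CHECK-LABEL: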
test_vld2q_u8 -// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x2_t [[TMP5]] uint8x16x2_t test_vld2q_u8(uint8_t const * a) { return vld2q_u8(a); } -// CHECK-LABEL: test_vld2q_u16 -// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x2_t [[TMP6]] uint16x8x2_t test_vld2q_u16(uint16_t const * a) { return vld2q_u16(a); } -// CHECK-LABEL: test_vld2q_u32 -// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>* +// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 +// 
CHECK: ret %struct.uint32x4x2_t [[TMP6]] uint32x4x2_t test_vld2q_u32(uint32_t const * a) { return vld2q_u32(a); } -// CHECK-LABEL: test_vld2q_s8 -// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP5]] int8x16x2_t test_vld2q_s8(int8_t const * a) { return vld2q_s8(a); } -// CHECK-LABEL: test_vld2q_s16 -// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP6]] int16x8x2_t test_vld2q_s16(int16_t const * a) { return vld2q_s16(a); } -// CHECK-LABEL: test_vld2q_s32 -// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>* +// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) 
+// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP6]] int32x4x2_t test_vld2q_s32(int32_t const * a) { return vld2q_s32(a); } -// CHECK-LABEL: test_vld2q_f16 -// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x2_t [[TMP6]] float16x8x2_t test_vld2q_f16(float16_t const * a) { return vld2q_f16(a); } -// CHECK-LABEL: test_vld2q_f32 -// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>* +// CHECK: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP6]] float32x4x2_t test_vld2q_f32(float32_t const * a) { return vld2q_f32(a); } -// CHECK-LABEL: test_vld2q_p8 -// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }* +// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8* 
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x2_t [[TMP5]] poly8x16x2_t test_vld2q_p8(poly8_t const * a) { return vld2q_p8(a); } -// CHECK-LABEL: test_vld2q_p16 -// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP6]] poly16x8x2_t test_vld2q_p16(poly16_t const * a) { return vld2q_p16(a); } -// CHECK-LABEL: test_vld2_u8 -// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x2_t [[TMP5]] uint8x8x2_t test_vld2_u8(uint8_t const * a) { return vld2_u8(a); } -// CHECK-LABEL: test_vld2_u16 -// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast 
%struct.uint16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP6]] uint16x4x2_t test_vld2_u16(uint16_t const * a) { return vld2_u16(a); } -// CHECK-LABEL: test_vld2_u32 -// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>* +// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP6]] uint32x2x2_t test_vld2_u32(uint32_t const * a) { return vld2_u32(a); } -// CHECK-LABEL: test_vld2_u64 -// CHECK: vld1.64 +// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x2_t [[TMP6]] uint64x1x2_t test_vld2_u64(uint64_t const * a) { return vld2_u64(a); } -// CHECK-LABEL: test_vld2_s8 -// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to 
i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x2_t [[TMP5]] int8x8x2_t test_vld2_s8(int8_t const * a) { return vld2_s8(a); } -// CHECK-LABEL: test_vld2_s16 -// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP6]] int16x4x2_t test_vld2_s16(int16_t const * a) { return vld2_s16(a); } -// CHECK-LABEL: test_vld2_s32 -// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>* +// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP6]] int32x2x2_t test_vld2_s32(int32_t const * a) { return vld2_s32(a); } -// CHECK-LABEL: test_vld2_s64 -// CHECK: vld1.64 +// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>* +// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* 
[[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x2_t [[TMP6]] int64x1x2_t test_vld2_s64(int64_t const * a) { return vld2_s64(a); } -// CHECK-LABEL: test_vld2_f16 -// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x2_t [[TMP6]] float16x4x2_t test_vld2_f16(float16_t const * a) { return vld2_f16(a); } -// CHECK-LABEL: test_vld2_f32 -// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>* +// CHECK: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0v2f32(<2 x float>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP6]] float32x2x2_t test_vld2_f32(float32_t const * a) { return vld2_f32(a); } -// CHECK-LABEL: test_vld2_p8 -// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]] 
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x2_t [[TMP5]]
poly8x8x2_t test_vld2_p8(poly8_t const * a) {
  return vld2_p8(a);
}
-// CHECK-LABEL: test_vld2_p16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
poly16x4x2_t test_vld2_p16(poly16_t const * a) {
  return vld2_p16(a);
}
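+// NOTE: The vld2_dup_* tests that follow check the replicating form,
+// @llvm.aarch64.neon.ld2r.*, which takes a pointer to the scalar element
+// rather than to a whole vector, so only pointer bitcasts precede the call.
+// A usage sketch, for illustration only (`broadcast_pair` is a hypothetical
+// helper, not part of this test):
+//   uint16x4x2_t broadcast_pair(const uint16_t *p) {
+//     return vld2_dup_u16(p);  /* val[0] = splat of p[0], val[1] = splat of p[1] */
+//   }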
-// CHECK-LABEL: test_vld2_dup_u8
-// CHECK: vld2.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_dup_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x2_t [[TMP4]]
uint8x8x2_t test_vld2_dup_u8(uint8_t const * a) {
  return vld2_dup_u8(a);
}
-// CHECK-LABEL: test_vld2_dup_u16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_dup_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x2_t [[TMP6]]
uint16x4x2_t test_vld2_dup_u16(uint16_t const * a) {
  return vld2_dup_u16(a);
}
-// CHECK-LABEL: test_vld2_dup_u32
-// CHECK: vld2.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_dup_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x2_t [[TMP6]]
uint32x2x2_t test_vld2_dup_u32(uint32_t const * a) {
  return vld2_dup_u32(a);
}
-// CHECK-LABEL: test_vld2_dup_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_dup_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint64x1x2_t [[TMP6]]
uint64x1x2_t test_vld2_dup_u64(uint64_t const * a) {
  return vld2_dup_u64(a);
}
-// CHECK-LABEL: test_vld2_dup_s8
-// CHECK: vld2.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] =
bitcast %struct.int8x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x2_t [[TMP4]] int8x8x2_t test_vld2_dup_s8(int8_t const * a) { return vld2_dup_s8(a); } -// CHECK-LABEL: test_vld2_dup_s16 -// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP6]] int16x4x2_t test_vld2_dup_s16(int16_t const * a) { return vld2_dup_s16(a); } -// CHECK-LABEL: test_vld2_dup_s32 -// CHECK: vld2.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP6]] int32x2x2_t test_vld2_dup_s32(int32_t const * a) { return vld2_dup_s32(a); } -// CHECK-LABEL: test_vld2_dup_s64 -// CHECK: vld1.64 +// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8* +// 
CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x2_t [[TMP6]] int64x1x2_t test_vld2_dup_s64(int64_t const * a) { return vld2_dup_s64(a); } -// CHECK-LABEL: test_vld2_dup_f16 -// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_dup_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x2_t [[TMP6]] float16x4x2_t test_vld2_dup_f16(float16_t const * a) { return vld2_dup_f16(a); } -// CHECK-LABEL: test_vld2_dup_f32 -// CHECK: vld2.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP6]] float32x2x2_t test_vld2_dup_f32(float32_t const * a) { return vld2_dup_f32(a); } -// CHECK-LABEL: test_vld2_dup_p8 -// CHECK: vld2.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_dup_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8* +// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* 
[[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x2_t [[TMP4]]
poly8x8x2_t test_vld2_dup_p8(poly8_t const * a) {
  return vld2_dup_p8(a);
}
-// CHECK-LABEL: test_vld2_dup_p16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_dup_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {
  return vld2_dup_p16(a);
}
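+// NOTE: For the vld2q_lane_* tests that follow, the two-vector operand is
+// passed as a [2 x <8 x i16>] array coerced through %b.coerce, so the checks
+// also cover the alloca/memcpy argument traffic before the call to
+// @llvm.aarch64.neon.ld2lane.*. A usage sketch, for illustration only
+// (`reload_lane` is a hypothetical helper, not part of this test):
+//   uint16x8x2_t reload_lane(const uint16_t *p, uint16x8x2_t v) {
+//     return vld2q_lane_u16(p, v, 7);  /* reloads lane 7 of both vectors */
+//   }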
+// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x2_t [[TMP1]]3 uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) { return vld2q_lane_u16(a, b, 7); } -// CHECK-LABEL: test_vld2q_lane_u32 -// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x2_t [[TMP1]]3 uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) 
{ return vld2q_lane_u32(a, b, 3); } -// CHECK-LABEL: test_vld2q_lane_s16 -// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP1]]3 int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) { return vld2q_lane_s16(a, b, 7); } -// CHECK-LABEL: test_vld2q_lane_s32 -// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8* +// CHECK: 
[[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP1]]3 int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) { return vld2q_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vld2q_lane_f16 -// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: 
[[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x2_t [[TMP1]]3 float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) { return vld2q_lane_f16(a, b, 7); } -// CHECK-LABEL: test_vld2q_lane_f32 -// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0i8(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float> } [[VLD2_LANE]], { <4 x float>, <4 x float> }* [[TMP1]]0 +// 
CHECK: [[TMP11:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP1]]3 float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) { return vld2q_lane_f32(a, b, 3); } -// CHECK-LABEL: test_vld2q_lane_p16 -// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP1]]0 +// CHECK: [[TMP11:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8* +// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false) +// CHECK: [[TMP13:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP1]]3 poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) { return vld2q_lane_p16(a, b, 7); } -// CHECK-LABEL: test_vld2_lane_u8 -// CHECK: vld2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// 
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK: [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
  return vld2_lane_u8(a, b, 7);
}
-// CHECK-LABEL: test_vld2_lane_u16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <4 x i16>] %b.coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x2_t [[TMP13]]
uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
  return vld2_lane_u16(a, b, 3);
}
-// CHECK-LABEL: test_vld2_lane_u32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <2 x i32>] %b.coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x2_t [[TMP13]]
uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
  return vld2_lane_u32(a, b, 1);
}
-// CHECK-LABEL: test_vld2_lane_s8
-// CHECK: vld2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK: [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x2_t [[TMP8]]
int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
  return vld2_lane_s8(a, b, 7);
}
-// CHECK-LABEL: test_vld2_lane_s16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <4 x i16>] %b.coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x2_t [[TMP13]]
int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
  return vld2_lane_s16(a, b, 3);
}
-// CHECK-LABEL: test_vld2_lane_s32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <2 x i32>] %b.coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x2_t [[TMP13]]
int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
  return vld2_lane_s32(a, b, 1);
}
-// CHECK-LABEL: test_vld2_lane_f16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <4 x half>] %b.coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float16x4x2_t [[TMP13]]
float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
  return vld2_lane_f16(a, b, 3);
}
-// CHECK-LABEL: test_vld2_lane_f32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <2 x float>] %b.coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK: [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0i8(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float> } [[VLD2_LANE]], { <2 x float>, <2 x float> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x2_t [[TMP13]]
float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
  return vld2_lane_f32(a, b, 1);
}
-// CHECK-LABEL: test_vld2_lane_p8
-// CHECK: vld2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK: [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x2_t [[TMP8]]
poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
  return vld2_lane_p8(a, b, 7);
}
-// CHECK-LABEL: test_vld2_lane_p16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <4 x i16>] %b.coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK: [[TMP11:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP13:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x2_t [[TMP13]]
poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
  return vld2_lane_p16(a, b, 3);
}
-// CHECK-LABEL: test_vld3q_u8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x3_t [[TMP5]]
uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
  return vld3q_u8(a);
}
-// CHECK-LABEL: test_vld3q_u16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x3_t [[TMP6]]
uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
  return vld3q_u16(a);
}
-// CHECK-LABEL: test_vld3q_u32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x3_t [[TMP6]]
uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
  return vld3q_u32(a);
}
-// CHECK-LABEL: test_vld3q_s8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int8x16x3_t [[TMP5]]
int8x16x3_t test_vld3q_s8(int8_t const * a) {
  return vld3q_s8(a);
}
-// CHECK-LABEL: test_vld3q_s16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x3_t [[TMP6]]
int16x8x3_t test_vld3q_s16(int16_t const * a) {
  return vld3q_s16(a);
}
-// CHECK-LABEL: test_vld3q_s32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x3_t [[TMP6]]
int32x4x3_t test_vld3q_s32(int32_t const * a) {
  return vld3q_s32(a);
}
-// CHECK-LABEL: test_vld3q_f16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float16x8x3_t [[TMP6]]
float16x8x3_t test_vld3q_f16(float16_t const * a) {
  return vld3q_f16(a);
}
-// CHECK-LABEL: test_vld3q_f32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
+// CHECK: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0v4f32(<4 x float>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float32x4x3_t [[TMP6]]
float32x4x3_t test_vld3q_f32(float32_t const * a) {
  return vld3q_f32(a);
}
-// CHECK-LABEL: test_vld3q_p8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly8x16x3_t [[TMP5]]
poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
  return vld3q_p8(a);
}
-// CHECK-LABEL: test_vld3q_p16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly16x8x3_t [[TMP6]]
poly16x8x3_t test_vld3q_p16(poly16_t const * a) {
  return vld3q_p16(a);
}
-// CHECK-LABEL: test_vld3_u8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x3_t [[TMP5]]
uint8x8x3_t test_vld3_u8(uint8_t const * a) {
  return vld3_u8(a);
}
-// CHECK-LABEL: test_vld3_u16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x3_t [[TMP6]]
uint16x4x3_t test_vld3_u16(uint16_t const * a) {
  return vld3_u16(a);
}
-// CHECK-LABEL: test_vld3_u32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x3_t [[TMP6]]
uint32x2x3_t test_vld3_u32(uint32_t const * a) {
  return vld3_u32(a);
}
-// CHECK-LABEL: test_vld3_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint64x1x3_t [[TMP6]]
uint64x1x3_t test_vld3_u64(uint64_t const * a) {
  return vld3_u64(a);
}
-// CHECK-LABEL: test_vld3_s8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x3_t [[TMP5]]
int8x8x3_t test_vld3_s8(int8_t const * a) {
  return vld3_s8(a);
}
-// CHECK-LABEL: test_vld3_s16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x3_t [[TMP6]]
int16x4x3_t test_vld3_s16(int16_t const * a) {
  return vld3_s16(a);
}
-// CHECK-LABEL: test_vld3_s32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x3_t [[TMP6]]
int32x2x3_t test_vld3_s32(int32_t const * a) {
  return vld3_s32(a);
}
-// CHECK-LABEL: test_vld3_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int64x1x3_t [[TMP6]]
int64x1x3_t test_vld3_s64(int64_t const * a) {
  return vld3_s64(a);
}
-// CHECK-LABEL: test_vld3_f16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float16x4x3_t [[TMP6]]
float16x4x3_t test_vld3_f16(float16_t const * a) {
  return vld3_f16(a);
}
-// CHECK-LABEL:
test_vld3_f32 -// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>* +// CHECK: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0v2f32(<2 x float>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x3_t [[TMP6]] float32x2x3_t test_vld3_f32(float32_t const * a) { return vld3_f32(a); } -// CHECK-LABEL: test_vld3_p8 -// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]]) +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false) +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x3_t [[TMP5]] poly8x8x3_t test_vld3_p8(poly8_t const * a) { return vld3_p8(a); } -// CHECK-LABEL: test_vld3_p16 -// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: 
[[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x3_t [[TMP6]] poly16x4x3_t test_vld3_p16(poly16_t const * a) { return vld3_p16(a); } -// CHECK-LABEL: test_vld3_dup_u8 -// CHECK: vld3.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_dup_u8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x3_t [[TMP4]] uint8x8x3_t test_vld3_dup_u8(uint8_t const * a) { return vld3_dup_u8(a); } -// CHECK-LABEL: test_vld3_dup_u16 -// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_dup_u16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x3_t [[TMP6]] uint16x4x3_t test_vld3_dup_u16(uint16_t const * a) { return vld3_dup_u16(a); } -// CHECK-LABEL: test_vld3_dup_u32 -// CHECK: vld3.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_dup_u32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast 
%struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x3_t [[TMP6]] uint32x2x3_t test_vld3_dup_u32(uint32_t const * a) { return vld3_dup_u32(a); } -// CHECK-LABEL: test_vld3_dup_u64 -// CHECK: vld1.64 +// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_dup_u64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint64x1x3_t [[TMP6]] uint64x1x3_t test_vld3_dup_u64(uint64_t const * a) { return vld3_dup_u64(a); } -// CHECK-LABEL: test_vld3_dup_s8 -// CHECK: vld3.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x3_t [[TMP4]] int8x8x3_t test_vld3_dup_s8(int8_t const * a) { return vld3_dup_s8(a); } -// CHECK-LABEL: test_vld3_dup_s16 -// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast 
%struct.int16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x3_t [[TMP6]] int16x4x3_t test_vld3_dup_s16(int16_t const * a) { return vld3_dup_s16(a); } -// CHECK-LABEL: test_vld3_dup_s32 -// CHECK: vld3.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x3_t [[TMP6]] int32x2x3_t test_vld3_dup_s32(int32_t const * a) { return vld3_dup_s32(a); } -// CHECK-LABEL: test_vld3_dup_s64 -// CHECK: vld1.64 +// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }* +// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int64x1x3_t [[TMP6]] int64x1x3_t test_vld3_dup_s64(int64_t const * a) { return vld3_dup_s64(a); } -// CHECK-LABEL: test_vld3_dup_f16 -// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_dup_f16(half* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = 
bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x3_t [[TMP6]] float16x4x3_t test_vld3_dup_f16(float16_t const * a) { return vld3_dup_f16(a); } -// CHECK-LABEL: test_vld3_dup_f32 -// CHECK: vld3.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float* +// CHECK: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x3_t [[TMP6]] float32x2x3_t test_vld3_dup_f32(float32_t const * a) { return vld3_dup_f32(a); } -// CHECK-LABEL: test_vld3_dup_p8 -// CHECK: vld3.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_dup_p8(i8* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a) +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]] +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false) +// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x3_t [[TMP4]] poly8x8x3_t test_vld3_dup_p8(poly8_t const * a) { return vld3_dup_p8(a); } -// CHECK-LABEL: test_vld3_dup_p16 -// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_dup_p16(i16* %a) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* +// 
CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) +// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x3_t [[TMP6]] poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) { return vld3_dup_p16(a); } -// CHECK-LABEL: test_vld3q_lane_u16 -// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> }
[[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x3_t [[TMP16]] uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) { return vld3q_lane_u16(a, b, 7); } -// CHECK-LABEL: test_vld3q_lane_u32 -// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8*
[[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x3_t [[TMP16]] uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) { return vld3q_lane_u32(a, b, 3); } -// CHECK-LABEL: test_vld3q_lane_s16 -// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x3_t [[TMP16]] int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) { return vld3q_lane_s16(a, b, 7); } -// CHECK-LABEL:
test_vld3q_lane_s32 -// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x3_t [[TMP16]] int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) { return vld3q_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vld3q_lane_f16 -// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK:
[[B:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x half>] %b.coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x3_t [[TMP16]] float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) { return vld3q_lane_f16(a, b, 7); } -// CHECK-LABEL: test_vld3q_lane_f32 -// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr
inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x float>] %b.coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0i8(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x3_t [[TMP16]] float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) { return vld3q_lane_f32(a, b, 3); } -// CHECK-LABEL: test_vld3q_lane_p16 -// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] =
bitcast %struct.poly16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x3_t [[TMP16]] poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) { return vld3q_lane_p16(a, b, 7); } -// CHECK-LABEL: test_vld3_lane_u8 -// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false) +// CHECK: [[TMP9:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x3_t [[TMP9]] uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) { return vld3_lane_u8(a, b, 7); } -// CHECK-LABEL: test_vld3_lane_u16 -// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +//
CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x3_t [[TMP16]] uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) { return vld3_lane_u16(a, b, 3); } -// CHECK-LABEL: test_vld3_lane_u32 -// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>*
[[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> +// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]] +// CHECK: [[TMP14:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) +// CHECK: [[TMP16:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x3_t [[TMP16]] uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) { return vld3_lane_u32(a, b, 1); } -// CHECK-LABEL: test_vld3_lane_s8 -// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a) +// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast
-// CHECK-LABEL: test_vld3_lane_s8
-// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP9:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x3_t [[TMP9]]
int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) { return vld3_lane_s8(a, b, 7); }
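+// NOTE: The 8-bit variants pass %a to the intrinsic directly and skip the
+// <8 x i8> bitcast round-trip, since their elements are already byte-sized.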
-// CHECK-LABEL: test_vld3_lane_s16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK: [[TMP14:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP16:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x3_t [[TMP16]]
int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) { return vld3_lane_s16(a, b, 3); }
-// CHECK-LABEL: test_vld3_lane_s32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
+// CHECK: [[TMP14:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP16:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x3_t [[TMP16]]
int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) { return vld3_lane_s32(a, b, 1); }
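+// NOTE: There is no half-typed ld3lane overload here: float16x4x3_t is
+// handled by reinterpreting the <4 x half> values as <4 x i16> and reusing
+// @llvm.aarch64.neon.ld3lane.v4i16, as the next block checks.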
+// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK: [[TMP14:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP16:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float16x4x3_t [[TMP16]]
float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) { return vld3_lane_f16(a, b, 3); }
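+// NOTE: float32x2x3_t, by contrast, goes through the natively typed
+// @llvm.aarch64.neon.ld3lane.v2f32 overload.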
-// CHECK-LABEL: test_vld3_lane_f32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK: [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0i8(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP13]]
+// CHECK: [[TMP14:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP16:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x3_t [[TMP16]]
float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) { return vld3_lane_f32(a, b, 1); }
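+// NOTE: The polynomial variants below lower identically to the same-width
+// integer variants; only the struct type names differ.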
-// CHECK-LABEL: test_vld3_lane_p8
-// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP9:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x3_t [[TMP9]]
poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) { return vld3_lane_p8(a, b, 7); }
-// CHECK-LABEL: test_vld3_lane_p16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK: [[TMP14:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP16:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x3_t [[TMP16]]
poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) { return vld3_lane_p16(a, b, 3); }
-// CHECK-LABEL: test_vld4q_u8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x4_t [[TMP5]]
uint8x16x4_t test_vld4q_u8(uint8_t const * a) { return vld4q_u8(a); }
-// CHECK-LABEL: test_vld4q_u16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x4_t [[TMP6]]
uint16x8x4_t test_vld4q_u16(uint16_t const * a) { return vld4q_u16(a); }
-// CHECK-LABEL: test_vld4q_u32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x4_t [[TMP6]]
uint32x4x4_t test_vld4q_u32(uint32_t const * a) { return vld4q_u32(a); }
-// CHECK-LABEL: test_vld4q_s8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int8x16x4_t [[TMP5]]
int8x16x4_t test_vld4q_s8(int8_t const * a) { return vld4q_s8(a); }
-// CHECK-LABEL: test_vld4q_s16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x4_t [[TMP6]]
int16x8x4_t test_vld4q_s16(int16_t const * a) { return vld4q_s16(a); }
-// CHECK-LABEL: test_vld4q_s32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x4_t [[TMP6]]
int32x4x4_t test_vld4q_s32(int32_t const * a) { return vld4q_s32(a); }
-// CHECK-LABEL: test_vld4q_f16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float16x8x4_t [[TMP6]]
float16x8x4_t test_vld4q_f16(float16_t const * a) { return vld4q_f16(a); }
-// CHECK-LABEL: test_vld4q_f32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
+// CHECK: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float32x4x4_t [[TMP6]]
float32x4x4_t test_vld4q_f32(float32_t const * a) { return vld4q_f32(a); }
-// CHECK-LABEL: test_vld4q_p8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly8x16x4_t [[TMP5]]
poly8x16x4_t test_vld4q_p8(poly8_t const * a) { return vld4q_p8(a); }
-// CHECK-LABEL: test_vld4q_p16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly16x8x4_t [[TMP6]]
poly16x8x4_t test_vld4q_p16(poly16_t const * a) { return vld4q_p16(a); }
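+// NOTE: The 64-bit (d-register) vld4 tests below check the same ld4
+// intrinsics as the q-register tests, just at half the vector width; the
+// 64-bit-element cases (vld4_u64/vld4_s64) also go through ld4.v1i64 even
+// though the old assembly checks only expected a vld1.64.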
-// CHECK-LABEL: test_vld4_u8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x4_t [[TMP5]]
uint8x8x4_t test_vld4_u8(uint8_t const * a) { return vld4_u8(a); }
-// CHECK-LABEL: test_vld4_u16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x4_t [[TMP6]]
uint16x4x4_t test_vld4_u16(uint16_t const * a) { return vld4_u16(a); }
-// CHECK-LABEL: test_vld4_u32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x4_t [[TMP6]]
uint32x2x4_t test_vld4_u32(uint32_t const * a) { return vld4_u32(a); }
-// CHECK-LABEL: test_vld4_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint64x1x4_t [[TMP6]]
uint64x1x4_t test_vld4_u64(uint64_t const * a) { return vld4_u64(a); }
-// CHECK-LABEL: test_vld4_s8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x4_t [[TMP5]]
int8x8x4_t test_vld4_s8(int8_t const * a) { return vld4_s8(a); }
-// CHECK-LABEL: test_vld4_s16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x4_t [[TMP6]]
int16x4x4_t test_vld4_s16(int16_t const * a) { return vld4_s16(a); }
-// CHECK-LABEL: test_vld4_s32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x4_t [[TMP6]]
int32x2x4_t test_vld4_s32(int32_t const * a) { return vld4_s32(a); }
-// CHECK-LABEL: test_vld4_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int64x1x4_t [[TMP6]]
int64x1x4_t test_vld4_s64(int64_t const * a) { return vld4_s64(a); }
-// CHECK-LABEL: test_vld4_f16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float16x4x4_t [[TMP6]]
float16x4x4_t test_vld4_f16(float16_t const * a) { return vld4_f16(a); }
-// CHECK-LABEL: test_vld4_f32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
+// CHECK: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0v2f32(<2 x float>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x4_t [[TMP6]]
float32x2x4_t test_vld4_f32(float32_t const * a) { return vld4_f32(a); }
-// CHECK-LABEL: test_vld4_p8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x4_t [[TMP5]]
poly8x8x4_t test_vld4_p8(poly8_t const * a) { return vld4_p8(a); }
-// CHECK-LABEL: test_vld4_p16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x4_t [[TMP6]]
poly16x4x4_t test_vld4_p16(poly16_t const * a) { return vld4_p16(a); }
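+// NOTE: The _dup tests lower to @llvm.aarch64.neon.ld4r ("load and
+// replicate"), which takes a pointer to a single element rather than a
+// vector pointer, so only a pointer bitcast is checked before the call.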
-// CHECK-LABEL: test_vld4_dup_u8
-// CHECK: vld4.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_dup_u8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x4_t [[TMP4]]
uint8x8x4_t test_vld4_dup_u8(uint8_t const * a) { return vld4_dup_u8(a); }
-// CHECK-LABEL: test_vld4_dup_u16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_dup_u16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x4_t [[TMP6]]
uint16x4x4_t test_vld4_dup_u16(uint16_t const * a) { return vld4_dup_u16(a); }
-// CHECK-LABEL: test_vld4_dup_u32
-// CHECK: vld4.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_dup_u32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x4_t [[TMP6]]
uint32x2x4_t test_vld4_dup_u32(uint32_t const * a) { return vld4_dup_u32(a); }
-// CHECK-LABEL: test_vld4_dup_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_dup_u64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint64x1x4_t [[TMP6]]
uint64x1x4_t test_vld4_dup_u64(uint64_t const * a) { return vld4_dup_u64(a); }
-// CHECK-LABEL: test_vld4_dup_s8
-// CHECK: vld4.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x4_t [[TMP4]]
int8x8x4_t test_vld4_dup_s8(int8_t const * a) { return vld4_dup_s8(a); }
-// CHECK-LABEL: test_vld4_dup_s16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x4_t [[TMP6]]
int16x4x4_t test_vld4_dup_s16(int16_t const * a) { return vld4_dup_s16(a); }
-// CHECK-LABEL: test_vld4_dup_s32
-// CHECK: vld4.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x4_t [[TMP6]]
int32x2x4_t test_vld4_dup_s32(int32_t const * a) { return vld4_dup_s32(a); }
-// CHECK-LABEL: test_vld4_dup_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int64x1x4_t [[TMP6]]
int64x1x4_t test_vld4_dup_s64(int64_t const * a) { return vld4_dup_s64(a); }
-// CHECK-LABEL: test_vld4_dup_f16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_dup_f16(half* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float16x4x4_t [[TMP6]]
float16x4x4_t test_vld4_dup_f16(float16_t const * a) { return vld4_dup_f16(a); }
-// CHECK-LABEL: test_vld4_dup_f32
-// CHECK: vld4.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x4_t [[TMP6]]
float32x2x4_t test_vld4_dup_f32(float32_t const * a) { return vld4_dup_f32(a); }
-// CHECK-LABEL: test_vld4_dup_p8
-// CHECK: vld4.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_dup_p8(i8* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x4_t [[TMP4]]
poly8x8x4_t test_vld4_dup_p8(poly8_t const * a) { return vld4_dup_p8(a); }
-// CHECK-LABEL: test_vld4_dup_p16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_dup_p16(i16* %a) #0 {
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x4_t [[TMP6]]
poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) { return vld4_dup_p16(a); }
@test_vld4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x4_t [[TMP19]] uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) { return vld4q_lane_u16(a, b, 7); }
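+// The vld4q_lane tests that follow all check the same lowering shape: each of
+// the four source vectors is pulled out of the coerced [4 x <N x T>] argument,
+// round-tripped through <16 x i8> (the builtins take generic byte vectors),
+// and passed to @llvm.aarch64.neon.ld4lane together with the lane index as an
+// i64 immediate; the aggregate result is then copied back out through
+// [[__RET]] and [[RETVAL]] via @llvm.memcpy.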
-// CHECK-LABEL: test_vld4q_lane_u32 -// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x4_t [[TMP19]] uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) { return vld4q_lane_u32(a, b, 3); }
-// CHECK-LABEL: test_vld4q_lane_s16 -// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x4_t [[TMP19]] int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) { return vld4q_lane_s16(a, b, 7); }
-// CHECK-LABEL: test_vld4q_lane_s32 -// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* +// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x4_t [[TMP19]] int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) { return vld4q_lane_s32(a, b, 3); }
-// CHECK-LABEL: test_vld4q_lane_f16 -// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float16x8x4_t [[TMP19]] float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) { return vld4q_lane_f16(a, b, 7); }
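+// Note: the float16 variants are checked as integer operations; the
+// <8 x half> values are bitcast to <8 x i16>, so the call goes to the same
+// @llvm.aarch64.neon.ld4lane.v8i16 intrinsic used by the i16 tests above.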
-// CHECK-LABEL: test_vld4q_lane_f32 -// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0i8(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* +// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x4_t [[TMP19]] float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) { return vld4q_lane_f32(a, b, 3); }
-// CHECK-LABEL: test_vld4q_lane_p16 -// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16 +// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16 +// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x4_t [[TMP19]] poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) { return vld4q_lane_p16(a, b, 7); }
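+// The D-register vld4_lane tests below mirror the Q-register ones with
+// 8-byte alignment and 32-byte struct copies; in the 8-bit variants the
+// <8 x i8> operands go to @llvm.aarch64.neon.ld4lane directly, with no
+// intermediate bitcasts, and the i8* argument %a is used as-is.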
-// CHECK-LABEL: test_vld4_lane_u8 -// CHECK: vld4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8 +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x4_t [[TMP10]] uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) { return vld4_lane_u8(a, b, 7); }
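+// Illustrative only (a sketch, not checked by this test; `buf` and `out` are
+// hypothetical pointers): a typical use of the lane form is to refill one
+// lane of a previously deinterleaved set, e.g.
+//   uint8x8x4_t v = vld4_u8(buf);      // deinterleave 4 x 8 bytes
+//   v = vld4_lane_u8(buf + 32, v, 7);  // reload lane 7 of each vector
+//   vst4_u8(out, v);                   // interleave and store back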
-// CHECK-LABEL: test_vld4_lane_u16 -// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x4_t [[TMP19]] uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) { return vld4_lane_u16(a, b, 3); }
-// CHECK-LABEL: test_vld4_lane_u32 -// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x4_t [[TMP19]] uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) { return vld4_lane_u32(a, b, 1); }
-// CHECK-LABEL: test_vld4_lane_s8 -// CHECK: vld4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8 +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x4_t [[TMP10]] int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) { return vld4_lane_s8(a, b, 7); }
-// CHECK-LABEL: test_vld4_lane_s16 -// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x4_t [[TMP19]] int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) { return vld4_lane_s16(a, b, 3); }
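+// The lane argument must be a compile-time constant in [0, lanes-1] for the
+// element type, which is why each test uses the maximum legal index: 7 for
+// eight-lane vectors, 3 for four-lane vectors, 1 for two-lane vectors.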
-// CHECK-LABEL: test_vld4_lane_s32 -// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* +// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x4_t [[TMP19]] int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) { return vld4_lane_s32(a, b, 1); }
-// CHECK-LABEL: test_vld4_lane_f16 -// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float16x4x4_t [[TMP19]] float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) { return vld4_lane_f16(a, b, 3); }
-// CHECK-LABEL: test_vld4_lane_f32 -// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float> +// CHECK: [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0i8(<2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i64 1, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* +// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x4_t [[TMP19]] float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) { return vld4_lane_f32(a, b, 1); }
%struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8 +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a) +// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* +// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]] +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false) +// CHECK: [[TMP10:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x4_t [[TMP10]] poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) { return vld4_lane_p8(a, b, 7); } -// CHECK-LABEL: test_vld4_lane_p16 -// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr
inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1 +// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8 +// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2 +// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8 +// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3 +// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8 +// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]] +// CHECK: [[TMP17:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8* +// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) +// CHECK: [[TMP19:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x4_t [[TMP19]] poly16x4x4_t test_vld4_lane_p16(poly16_t const * a, poly16x4x4_t b) { return vld4_lane_p16(a, b, 3); } -// CHECK-LABEL: test_vmax_s8 -// CHECK: vmax.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmax_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMAX_I]] int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) { return vmax_s8(a, b); } -// CHECK-LABEL: test_vmax_s16 -// CHECK: vmax.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmax_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VMAX2_I]] int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) { return vmax_s16(a, b); } -// CHECK-LABEL: test_vmax_s32 -// CHECK: vmax.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmax_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <2 x i32> [[VMAX2_I]] int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) { return vmax_s32(a, b); } -// CHECK-LABEL: test_vmax_u8 -// CHECK: vmax.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmax_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMAX_I]] uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) { return vmax_u8(a, b); } -// CHECK-LABEL: test_vmax_u16 -// CHECK: vmax.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmax_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VMAX2_I]] uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) { return vmax_u16(a, b); } -// CHECK-LABEL: test_vmax_u32 -// CHECK: vmax.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmax_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <2 x i32> [[VMAX2_I]] uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) { return vmax_u32(a, b); } -// CHECK-LABEL: test_vmax_f32 -// CHECK: vmax.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmax_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> [[VMAX_I]], <2 x float> [[VMAX1_I]]) #4 +// CHECK: ret <2 x float> [[VMAX2_I]] float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) { return vmax_f32(a, b); } -// CHECK-LABEL: test_vmaxq_s8 -// CHECK: vmax.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMAX_I]] int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) { return vmaxq_s8(a, b); } -// CHECK-LABEL: test_vmaxq_s16 -// CHECK: vmax.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] 
to <8 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <8 x i16> [[VMAX2_I]] int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) { return vmaxq_s16(a, b); } -// CHECK-LABEL: test_vmaxq_s32 -// CHECK: vmax.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i32> [[VMAX2_I]] int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) { return vmaxq_s32(a, b); } -// CHECK-LABEL: test_vmaxq_u8 -// CHECK: vmax.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMAX_I]] uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) { return vmaxq_u8(a, b); } -// CHECK-LABEL: test_vmaxq_u16 -// CHECK: vmax.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4 +// CHECK: ret <8 x i16> [[VMAX2_I]] uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) { return vmaxq_u16(a, b); } -// CHECK-LABEL: test_vmaxq_u32 -// CHECK: vmax.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4 +// CHECK: ret <4 x i32> [[VMAX2_I]] uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) { return vmaxq_u32(a, b); } -// CHECK-LABEL: test_vmaxq_f32 -// CHECK: vmax.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vmaxq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> [[VMAX_I]], <4 x float> [[VMAX1_I]]) #4 +// CHECK: ret <4 x float> [[VMAX2_I]] float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) { return vmaxq_f32(a, b); } -// CHECK-LABEL: test_vmin_s8 -// CHECK: vmin.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmin_s8(<8 x 
i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMIN_I]] int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) { return vmin_s8(a, b); } -// CHECK-LABEL: test_vmin_s16 -// CHECK: vmin.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmin_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VMIN2_I]] int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) { return vmin_s16(a, b); } -// CHECK-LABEL: test_vmin_s32 -// CHECK: vmin.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmin_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VMIN2_I]] int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) { return vmin_s32(a, b); } -// CHECK-LABEL: test_vmin_u8 -// CHECK: vmin.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmin_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMIN_I]] uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) { return vmin_u8(a, b); } -// CHECK-LABEL: test_vmin_u16 -// CHECK: vmin.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmin_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VMIN2_I]] uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) { return vmin_u16(a, b); } -// CHECK-LABEL: test_vmin_u32 -// CHECK: vmin.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmin_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VMIN2_I]] uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) { return vmin_u32(a, b); } -// CHECK-LABEL: test_vmin_f32 -// CHECK: vmin.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmin_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> 
+// CHECK: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> [[VMIN_I]], <2 x float> [[VMIN1_I]]) #4 +// CHECK: ret <2 x float> [[VMIN2_I]] float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) { return vmin_f32(a, b); } -// CHECK-LABEL: test_vminq_s8 -// CHECK: vmin.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vminq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMIN_I]] int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) { return vminq_s8(a, b); } -// CHECK-LABEL: test_vminq_s16 -// CHECK: vmin.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <8 x i16> [[VMIN2_I]] int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) { return vminq_s16(a, b); } -// CHECK-LABEL: test_vminq_s32 -// CHECK: vmin.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vminq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i32> [[VMIN2_I]] int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) { return vminq_s32(a, b); } -// CHECK-LABEL: test_vminq_u8 -// CHECK: vmin.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMIN_I]] uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) { return vminq_u8(a, b); } -// CHECK-LABEL: test_vminq_u16 -// CHECK: vmin.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vminq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4 +// CHECK: ret <8 x i16> [[VMIN2_I]] uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) { return vminq_u16(a, b); } -// CHECK-LABEL: test_vminq_u32 -// CHECK: vmin.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: 
[[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4 +// CHECK: ret <4 x i32> [[VMIN2_I]] uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) { return vminq_u32(a, b); } -// CHECK-LABEL: test_vminq_f32 -// CHECK: vmin.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vminq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> [[VMIN_I]], <4 x float> [[VMIN1_I]]) #4 +// CHECK: ret <4 x float> [[VMIN2_I]] float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) { return vminq_f32(a, b); } -// CHECK-LABEL: test_vmla_s8 -// CHECK: vmla.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmla_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]] +// CHECK: ret <8 x i8> [[ADD_I]] int8x8_t test_vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vmla_s8(a, b, c); } -// CHECK-LABEL: test_vmla_s16 -// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmla_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[ADD_I]] int16x4_t test_vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmla_s16(a, b, c); } -// CHECK-LABEL: test_vmla_s32 -// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmla_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmla_s32(a, b, c); } -// CHECK-LABEL: test_vmla_f32 -// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -// CHECK-SWIFT: vadd.f32 -// CHECK-A57: vmla.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmla_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c +// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]] +// CHECK: ret <2 x float> [[ADD_I]] float32x2_t test_vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmla_f32(a, b, c); } -// CHECK-LABEL: test_vmla_u8 -// CHECK: vmla.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmla_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c +// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]] +// CHECK: ret <8 x i8> [[ADD_I]] uint8x8_t test_vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vmla_u8(a, b, c); } -// CHECK-LABEL: test_vmla_u16 -// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmla_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) 
{ return vmla_u16(a, b, c); } -// CHECK-LABEL: test_vmla_u32 -// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmla_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmla_u32(a, b, c); } -// CHECK-LABEL: test_vmlaq_s8 -// CHECK: vmla.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmlaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]] +// CHECK: ret <16 x i8> [[ADD_I]] int8x16_t test_vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return vmlaq_s8(a, b, c); } -// CHECK-LABEL: test_vmlaq_s16 -// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return vmlaq_s16(a, b, c); } -// CHECK-LABEL: test_vmlaq_s32 -// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return vmlaq_s32(a, b, c); } -// CHECK-LABEL: test_vmlaq_f32 -// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} -// CHECK-SWIFT: vadd.f32 -// CHECK-A57: vmla.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vmlaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c +// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]] +// CHECK: ret <4 x float> [[ADD_I]] float32x4_t test_vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vmlaq_f32(a, b, c); } -// CHECK-LABEL: test_vmlaq_u8 -// CHECK: vmla.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmlaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c +// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]] +// CHECK: ret <16 x i8> [[ADD_I]] uint8x16_t test_vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vmlaq_u8(a, b, c); } -// CHECK-LABEL: test_vmlaq_u16 -// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vmlaq_u16(a, b, c); } -// CHECK-LABEL: test_vmlaq_u32 -// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vmlaq_u32(a, b, c); } -// CHECK-LABEL: test_vmlal_s8 -// CHECK: 
vmlal.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vmlal_s8(a, b, c); } -// CHECK-LABEL: test_vmlal_s16 -// CHECK: vmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_s16(a, b, c); } -// CHECK-LABEL: test_vmlal_s32 -// CHECK: vmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_s32(a, b, c); } -// CHECK-LABEL: test_vmlal_u8 -// CHECK: vmlal.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vmlal_u8(a, b, c); } -// CHECK-LABEL: test_vmlal_u16 -// CHECK: vmlal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_u16(a, b, c); } -// CHECK-LABEL: test_vmlal_u32 -// CHECK: vmlal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: 
[[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlal_u32(a, b, c); } -// CHECK-LABEL: test_vmlal_lane_s16 -// CHECK: vmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_lane_s16(a, b, c, 3); } -// CHECK-LABEL: test_vmlal_lane_s32 -// CHECK: vmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_lane_s32(a, b, c, 1); } -// CHECK-LABEL: test_vmlal_lane_u16 -// CHECK: vmlal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_lane_u16(a, b, c, 3); } -// CHECK-LABEL: test_vmlal_lane_u32 -// CHECK: vmlal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8>
[[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[ADD]] uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlal_lane_u32(a, b, c, 1); } -// CHECK-LABEL: test_vmlal_n_s16 -// CHECK: vmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vmlal_n_s16(a, b, c); } -// CHECK-LABEL: test_vmlal_n_s32 -// CHECK: vmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vmlal_n_s32(a, b, c); } -// CHECK-LABEL: test_vmlal_n_u16 -// CHECK: vmlal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, 
uint16_t c) { return vmlal_n_u16(a, b, c); } -// CHECK-LABEL: test_vmlal_n_u32 -// CHECK: vmlal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[ADD_I]] uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { return vmlal_n_u32(a, b, c); } -// CHECK-LABEL: test_vmla_lane_s16 -// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmla_lane_s16(a, b, c, 3); } -// CHECK-LABEL: test_vmla_lane_s32 -// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmla_lane_s32(a, b, c, 1); } -// CHECK-LABEL: test_vmla_lane_u16 -// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[ADD]] uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmla_lane_u16(a, b, c, 3); } -// CHECK-LABEL: test_vmla_lane_u32 -// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[ADD]] uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmla_lane_u32(a, b, c, 1); } -// CHECK-LABEL: test_vmla_lane_f32 -// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] -// CHECK-SWIFT: vadd.f32 -// CHECK-A57: vmla.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]]
= fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmla_lane_f32(a, b, c, 1); } -// CHECK-LABEL: test_vmlaq_lane_s16 -// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { return vmlaq_lane_s16(a, b, c, 3); } -// CHECK-LABEL: test_vmlaq_lane_s32 -// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { return vmlaq_lane_s32(a, b, c, 1); } -// CHECK-LABEL: test_vmlaq_lane_u16 -// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[ADD]] uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) { return vmlaq_lane_u16(a, b, c, 3); } -// CHECK-LABEL: test_vmlaq_lane_u32 -// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) { return vmlaq_lane_u32(a, b, c, 1); } -// CHECK-LABEL: test_vmlaq_lane_f32 -// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] -// CHECK-SWIFT: vadd.f32 -// CHECK-A57: vmla.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) { return vmlaq_lane_f32(a, b, c, 1); } -// CHECK-LABEL: test_vmla_n_s16 -// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16>
[[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[ADD_I]] int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) { return vmla_n_s16(a, b, c); } -// CHECK-LABEL: test_vmla_n_s32 -// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) { return vmla_n_s32(a, b, c); } -// CHECK-LABEL: test_vmla_n_u16 -// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[ADD_I]] uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { return vmla_n_u16(a, b, c); } -// CHECK-LABEL: test_vmla_n_u32 -// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[ADD_I]] uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { return vmla_n_u32(a, b, c); } -// CHECK-LABEL: test_vmla_n_f32 -// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -// CHECK-SWIFT: vadd.f32 -// CHECK-A57: vmla.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmla_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1 +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]] +// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]] +// CHECK: ret <2 x float> [[ADD_I]] float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) { return vmla_n_f32(a, b, c); } -// CHECK-LABEL: test_vmlaq_n_s16 -// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: 
[[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { return vmlaq_n_s16(a, b, c); } -// CHECK-LABEL: test_vmlaq_n_s32 -// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { return vmlaq_n_s32(a, b, c); } -// CHECK-LABEL: test_vmlaq_n_u16 -// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[ADD_I]] uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { return vmlaq_n_u16(a, b, c); } -// CHECK-LABEL: test_vmlaq_n_u32 -// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[ADD_I]] uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { return vmlaq_n_u32(a, b, c); } -// CHECK-LABEL: test_vmlaq_n_f32 -// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0] -// CHECK-SWIFT: vadd.f32 -// CHECK-A57: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, -// CHECK-A57: vmla.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, 
float %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3 +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]] +// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]] +// CHECK: ret <4 x float> [[ADD_I]] float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) { return vmlaq_n_f32(a, b, c); } -// CHECK-LABEL: test_vmls_s8 -// CHECK: vmls.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmls_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]] +// CHECK: ret <8 x i8> [[SUB_I]] int8x8_t test_vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vmls_s8(a, b, c); } -// CHECK-LABEL: test_vmls_s16 -// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmls_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[SUB_I]] int16x4_t test_vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmls_s16(a, b, c); } -// CHECK-LABEL: test_vmls_s32 -// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmls_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] int32x2_t test_vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmls_s32(a, b, c); } -// CHECK-LABEL: test_vmls_f32 -// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -// CHECK-SWIFT: vsub.f32 -// CHECK-A57: vmls.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmls_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c +// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]] +// CHECK: ret <2 x float> [[SUB_I]] float32x2_t test_vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmls_f32(a, b, c); } -// CHECK-LABEL: test_vmls_u8 -// CHECK: vmls.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmls_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]] +// CHECK: ret <8 x i8> [[SUB_I]] uint8x8_t test_vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vmls_u8(a, b, c); } -// CHECK-LABEL: test_vmls_u16 -// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmls_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[SUB_I]] uint16x4_t test_vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmls_u16(a, b, c); } -// CHECK-LABEL: test_vmls_u32 -// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmls_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] uint32x2_t test_vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmls_u32(a, b, 
c); } -// CHECK-LABEL: test_vmlsq_s8 -// CHECK: vmls.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]] +// CHECK: ret <16 x i8> [[SUB_I]] int8x16_t test_vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return vmlsq_s8(a, b, c); } -// CHECK-LABEL: test_vmlsq_s16 -// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return vmlsq_s16(a, b, c); } -// CHECK-LABEL: test_vmlsq_s32 -// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return vmlsq_s32(a, b, c); } -// CHECK-LABEL: test_vmlsq_f32 -// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} -// CHECK-SWIFT: vsub.f32 -// CHECK-A57: vmls.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vmlsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c +// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]] +// CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vmlsq_f32(a, b, c); } -// CHECK-LABEL: test_vmlsq_u8 -// CHECK: vmls.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]] +// CHECK: ret <16 x i8> [[SUB_I]] uint8x16_t test_vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vmlsq_u8(a, b, c); } -// CHECK-LABEL: test_vmlsq_u16 -// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vmlsq_u16(a, b, c); } -// CHECK-LABEL: test_vmlsq_u32 -// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vmlsq_u32(a, b, c); } -// CHECK-LABEL: test_vmlsl_s8 -// CHECK: vmlsl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vmlsl_s8(a, b, c); } -// 
CHECK-LABEL: test_vmlsl_s16 -// CHECK: vmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlsl_s16(a, b, c); } -// CHECK-LABEL: test_vmlsl_s32 -// CHECK: vmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlsl_s32(a, b, c); } -// CHECK-LABEL: test_vmlsl_u8 -// CHECK: vmlsl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4 +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vmlsl_u8(a, b, c); } -// CHECK-LABEL: test_vmlsl_u16 -// CHECK: vmlsl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlsl_u16(a, b, c); } -// CHECK-LABEL: test_vmlsl_u32 -// CHECK: vmlsl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlsl_u32(a, b, 
c); } -// CHECK-LABEL: test_vmlsl_lane_s16 -// CHECK: vmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlsl_lane_s16(a, b, c, 3); } -// CHECK-LABEL: test_vmlsl_lane_s32 -// CHECK: vmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlsl_lane_s32(a, b, c, 1); } -// CHECK-LABEL: test_vmlsl_lane_u16 -// CHECK: vmlsl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlsl_lane_u16(a, b, c, 3); } -// CHECK-LABEL: test_vmlsl_lane_u32 -// CHECK: vmlsl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: ret <2 x i64> [[SUB]] uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlsl_lane_u32(a, b, c, 1); } -// CHECK-LABEL: 
test_vmlsl_n_s16 -// CHECK: vmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vmlsl_n_s16(a, b, c); } -// CHECK-LABEL: test_vmlsl_n_s32 -// CHECK: vmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vmlsl_n_s32(a, b, c); } -// CHECK-LABEL: test_vmlsl_n_u16 -// CHECK: vmlsl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { return vmlsl_n_u16(a, b, c); } -// CHECK-LABEL: test_vmlsl_n_u32 -// CHECK: vmlsl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to 
<8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4 +// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] +// CHECK: ret <2 x i64> [[SUB_I]] uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { return vmlsl_n_u32(a, b, c); } -// CHECK-LABEL: test_vmls_lane_s16 -// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmls_lane_s16(a, b, c, 3); } -// CHECK-LABEL: test_vmls_lane_s32 -// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmls_lane_s32(a, b, c, 1); } -// CHECK-LABEL: test_vmls_lane_u16 -// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: ret <4 x i16> [[SUB]] uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmls_lane_u16(a, b, c, 3); } -// CHECK-LABEL: test_vmls_lane_u32 -// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: ret <2 x i32> [[SUB]] uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmls_lane_u32(a, b, c, 1); } -// CHECK-LABEL: test_vmls_lane_f32 -// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] -// CHECK-SWIFT: vsub.f32 -// CHECK-A57: vmls.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmls_lane_f32(a, b, c, 1); } -// CHECK-LABEL: test_vmlsq_lane_s16 -// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) 
#0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { return vmlsq_lane_s16(a, b, c, 3); } -// CHECK-LABEL: test_vmlsq_lane_s32 -// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { return vmlsq_lane_s32(a, b, c, 1); } -// CHECK-LABEL: test_vmlsq_lane_u16 -// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: ret <8 x i16> [[SUB]] uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) { return vmlsq_lane_u16(a, b, c, 3); } -// CHECK-LABEL: test_vmlsq_lane_u32 -// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) { return vmlsq_lane_u32(a, b, c, 1); } -// CHECK-LABEL: test_vmlsq_lane_f32 -// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] -// CHECK-SWIFT: vsub.f32 -// CHECK-A57: vmls.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) { return vmlsq_lane_f32(a, b, c, 1); } -// CHECK-LABEL: test_vmls_n_s16 -// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[SUB_I]] int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) { return vmls_n_s16(a, b, c); } -// CHECK-LABEL: test_vmls_n_s32 -// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 
%c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) { return vmls_n_s32(a, b, c); } -// CHECK-LABEL: test_vmls_n_u16 -// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] +// CHECK: ret <4 x i16> [[SUB_I]] uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { return vmls_n_u16(a, b, c); } -// CHECK-LABEL: test_vmls_n_u32 -// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] +// CHECK: ret <2 x i32> [[SUB_I]] uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { return vmls_n_u32(a, b, c); } -// CHECK-LABEL: test_vmls_n_f32 -// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -// CHECK-SWIFT: vsub.f32 -// CHECK-A57: vmls.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1 +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]] +// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]] +// CHECK: ret <2 x float> [[SUB_I]] float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) { return vmls_n_f32(a, b, c); } -// CHECK-LABEL: test_vmlsq_n_s16 -// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, 
int16_t c) { return vmlsq_n_s16(a, b, c); } -// CHECK-LABEL: test_vmlsq_n_s32 -// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { return vmlsq_n_s32(a, b, c); } -// CHECK-LABEL: test_vmlsq_n_u16 -// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] +// CHECK: ret <8 x i16> [[SUB_I]] uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { return vmlsq_n_u16(a, b, c); } -// CHECK-LABEL: test_vmlsq_n_u32 -// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] +// CHECK: ret <4 x i32> [[SUB_I]] uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { return vmlsq_n_u32(a, b, c); } -// CHECK-LABEL: test_vmlsq_n_f32 -// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0] -// CHECK-SWIFT: vsub.f32 -// CHECK-A57: vmls.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3 +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]] +// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]] +// CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, 
float32_t c) { return vmlsq_n_f32(a, b, c); } -// CHECK-LABEL: test_vmovl_s8 -// CHECK: vmovl.s8 q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmovl_s8(<8 x i8> %a) #0 { +// CHECK: [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I]] int16x8_t test_vmovl_s8(int8x8_t a) { return vmovl_s8(a); } -// CHECK-LABEL: test_vmovl_s16 -// CHECK: vmovl.s16 q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I]] int32x4_t test_vmovl_s16(int16x4_t a) { return vmovl_s16(a); } -// CHECK-LABEL: test_vmovl_s32 -// CHECK: vmovl.s32 q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmovl_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I]] int64x2_t test_vmovl_s32(int32x2_t a) { return vmovl_s32(a); } -// CHECK-LABEL: test_vmovl_u8 -// CHECK: vmovl.u8 q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmovl_u8(<8 x i8> %a) #0 { +// CHECK: [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[VMOVL_I]] uint16x8_t test_vmovl_u8(uint8x8_t a) { return vmovl_u8(a); } -// CHECK-LABEL: test_vmovl_u16 -// CHECK: vmovl.u16 q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: ret <4 x i32> [[VMOVL_I]] uint32x4_t test_vmovl_u16(uint16x4_t a) { return vmovl_u16(a); } -// CHECK-LABEL: test_vmovl_u32 -// CHECK: vmovl.u32 q{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: ret <2 x i64> [[VMOVL_I]] uint64x2_t test_vmovl_u32(uint32x2_t a) { return vmovl_u32(a); } -// CHECK-LABEL: test_vmovn_s16 -// CHECK: vmovn.i16 d{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK: ret <8 x i8> [[VMOVN_I]] int8x8_t test_vmovn_s16(int16x8_t a) { return vmovn_s16(a); } -// CHECK-LABEL: test_vmovn_s32 -// CHECK: vmovn.i32 d{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK: ret <4 x i16> [[VMOVN_I]] int16x4_t test_vmovn_s32(int32x4_t a) { return vmovn_s32(a); } -// CHECK-LABEL: test_vmovn_s64 -// CHECK: vmovn.i64 d{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// 
CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK: ret <2 x i32> [[VMOVN_I]] int32x2_t test_vmovn_s64(int64x2_t a) { return vmovn_s64(a); } -// CHECK-LABEL: test_vmovn_u16 -// CHECK: vmovn.i16 d{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK: ret <8 x i8> [[VMOVN_I]] uint8x8_t test_vmovn_u16(uint16x8_t a) { return vmovn_u16(a); } -// CHECK-LABEL: test_vmovn_u32 -// CHECK: vmovn.i32 d{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK: ret <4 x i16> [[VMOVN_I]] uint16x4_t test_vmovn_u32(uint32x4_t a) { return vmovn_u32(a); } -// CHECK-LABEL: test_vmovn_u64 -// CHECK: vmovn.i64 d{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK: ret <2 x i32> [[VMOVN_I]] uint32x2_t test_vmovn_u64(uint64x2_t a) { return vmovn_u64(a); } -// CHECK-LABEL: test_vmov_n_u8 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmov_n_u8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VECINIT7_I]] uint8x8_t test_vmov_n_u8(uint8_t a) { return vmov_n_u8(a); } -// CHECK-LABEL: test_vmov_n_u16 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmov_n_u16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VECINIT3_I]] uint16x4_t test_vmov_n_u16(uint16_t a) { return vmov_n_u16(a); } -// CHECK-LABEL: test_vmov_n_u32 -// CHECK: mov {{r[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmov_n_u32(i32 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1 +// CHECK: ret <2 x i32> [[VECINIT1_I]] uint32x2_t test_vmov_n_u32(uint32_t a) { return vmov_n_u32(a); } -// CHECK-LABEL: test_vmov_n_s8 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmov_n_s8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = 
insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VECINIT7_I]] int8x8_t test_vmov_n_s8(int8_t a) { return vmov_n_s8(a); } -// CHECK-LABEL: test_vmov_n_s16 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmov_n_s16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VECINIT3_I]] int16x4_t test_vmov_n_s16(int16_t a) { return vmov_n_s16(a); } -// CHECK-LABEL: test_vmov_n_s32 -// CHECK: mov {{r[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmov_n_s32(i32 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1 +// CHECK: ret <2 x i32> [[VECINIT1_I]] int32x2_t test_vmov_n_s32(int32_t a) { return vmov_n_s32(a); } -// CHECK-LABEL: test_vmov_n_p8 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmov_n_p8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VECINIT7_I]] poly8x8_t test_vmov_n_p8(poly8_t a) { return vmov_n_p8(a); } -// CHECK-LABEL: test_vmov_n_p16 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmov_n_p16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VECINIT3_I]] poly16x4_t test_vmov_n_p16(poly16_t a) { return vmov_n_p16(a); } -// CHECK-LABEL: test_vmov_n_f16 -// CHECK: vld1.16 {{{d[0-9]+\[\]}}} +// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(half* %a) #0 { +// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 +// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0 +// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1 +// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2 +// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half>
[[VECINIT2]], half [[TMP0]], i32 3 +// CHECK: ret <4 x half> [[VECINIT3]] float16x4_t test_vmov_n_f16(float16_t *a) { return vmov_n_f16(*a); } -// CHECK-LABEL: test_vmov_n_f32 -// CHECK: mov {{r[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmov_n_f32(float %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1 +// CHECK: ret <2 x float> [[VECINIT1_I]] float32x2_t test_vmov_n_f32(float32_t a) { return vmov_n_f32(a); } -// CHECK-LABEL: test_vmovq_n_u8 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmovq_n_u8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8 +// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9 +// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10 +// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11 +// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12 +// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13 +// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14 +// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VECINIT15_I]] uint8x16_t test_vmovq_n_u8(uint8_t a) { return vmovq_n_u8(a); } -// CHECK-LABEL: test_vmovq_n_u16 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmovq_n_u16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VECINIT7_I]] uint16x8_t test_vmovq_n_u16(uint16_t a) { return vmovq_n_u16(a); } -// CHECK-LABEL: test_vmovq_n_u32 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmovq_n_u32(i32 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3 +// CHECK: ret <4 x i32> [[VECINIT3_I]]
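// Editor's note, not part of the original patch: before optimization the
// vmov*_n_* and vdup*_n_* splat intrinsics lower to the insertelement chains
// checked above; a single splat shuffle only appears once the optimizer runs.
// A minimal usage sketch under that assumption (the helper name splat4_u32 is
// hypothetical, not something this test defines):
//   static inline uint32x4_t splat4_u32(uint32_t x) {
//     return vmovq_n_u32(x); /* lane-for-lane the same as vdupq_n_u32(x) */
//   }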
uint32x4_t test_vmovq_n_u32(uint32_t a) { return vmovq_n_u32(a); } -// CHECK-LABEL: test_vmovq_n_s8 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmovq_n_s8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8 +// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9 +// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10 +// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11 +// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12 +// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13 +// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14 +// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VECINIT15_I]] int8x16_t test_vmovq_n_s8(int8_t a) { return vmovq_n_s8(a); } -// CHECK-LABEL: test_vmovq_n_s16 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmovq_n_s16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VECINIT7_I]] int16x8_t test_vmovq_n_s16(int16_t a) { return vmovq_n_s16(a); } -// CHECK-LABEL: test_vmovq_n_s32 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmovq_n_s32(i32 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3 +// CHECK: ret <4 x i32> [[VECINIT3_I]] int32x4_t test_vmovq_n_s32(int32_t a) { return vmovq_n_s32(a); } -// CHECK-LABEL: test_vmovq_n_p8 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmovq_n_p8(i8 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x 
i8> [[VECINIT2_I]], i8 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7 +// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8 +// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9 +// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10 +// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11 +// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12 +// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13 +// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14 +// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VECINIT15_I]] poly8x16_t test_vmovq_n_p8(poly8_t a) { return vmovq_n_p8(a); } -// CHECK-LABEL: test_vmovq_n_p16 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmovq_n_p16(i16 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VECINIT7_I]] poly16x8_t test_vmovq_n_p16(poly16_t a) { return vmovq_n_p16(a); } -// CHECK-LABEL: test_vmovq_n_f16 -// CHECK: vld1.16 {{{d[0-9]+\[\], d[0-9]+\[\]}}} +// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(half* %a) #0 { +// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 +// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0 +// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1 +// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2 +// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3 +// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4 +// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5 +// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6 +// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7 +// CHECK: ret <8 x half> [[VECINIT7]] float16x8_t test_vmovq_n_f16(float16_t *a) { return vmovq_n_f16(*a); } -// CHECK-LABEL: test_vmovq_n_f32 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vmovq_n_f32(float %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x
float> [[VECINIT2_I]], float %a, i32 3 +// CHECK: ret <4 x float> [[VECINIT3_I]] float32x4_t test_vmovq_n_f32(float32_t a) { return vmovq_n_f32(a); } -// CHECK-LABEL: test_vmov_n_s64 -// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0 -// CHECK: vmov.32 [[REG]][1], r1 +// CHECK-LABEL: define <1 x i64> @test_vmov_n_s64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] +// CHECK: ret <1 x i64> [[ADD_I]] int64x1_t test_vmov_n_s64(int64_t a) { int64x1_t tmp = vmov_n_s64(a); return vadd_s64(tmp, tmp); } -// CHECK-LABEL: test_vmov_n_u64 -// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0 -// CHECK: vmov.32 [[REG]][1], r1 +// CHECK-LABEL: define <1 x i64> @test_vmov_n_u64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0 +// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] +// CHECK: ret <1 x i64> [[ADD_I]] uint64x1_t test_vmov_n_u64(uint64_t a) { uint64x1_t tmp = vmov_n_u64(a); return vadd_u64(tmp, tmp); } -// CHECK-LABEL: test_vmovq_n_s64 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmovq_n_s64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VECINIT1_I]] int64x2_t test_vmovq_n_s64(int64_t a) { return vmovq_n_s64(a); } -// CHECK-LABEL: test_vmovq_n_u64 -// CHECK: vmov {{r[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmovq_n_u64(i64 %a) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VECINIT1_I]] uint64x2_t test_vmovq_n_u64(uint64_t a) { return vmovq_n_u64(a); } -// CHECK-LABEL: test_vmul_s8 -// CHECK: vmul.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmul_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[MUL_I]] int8x8_t test_vmul_s8(int8x8_t a, int8x8_t b) { return vmul_s8(a, b); } -// CHECK-LABEL: test_vmul_s16 -// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmul_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[MUL_I]] int16x4_t test_vmul_s16(int16x4_t a, int16x4_t b) { return vmul_s16(a, b); } -// CHECK-LABEL: test_vmul_s32 -// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmul_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[MUL_I]] int32x2_t test_vmul_s32(int32x2_t a, int32x2_t b) { return vmul_s32(a, b); } -// CHECK-LABEL: test_vmul_f32 -// CHECK: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmul_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, %b +// CHECK: ret <2 x float> [[MUL_I]] float32x2_t test_vmul_f32(float32x2_t a, float32x2_t b) { return vmul_f32(a, b); } -// CHECK-LABEL: test_vmul_u8 -// CHECK: vmul.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmul_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[MUL_I]] uint8x8_t test_vmul_u8(uint8x8_t a, uint8x8_t b) { return vmul_u8(a, b); } -// CHECK-LABEL: test_vmul_u16 -// CHECK: vmul.i16 
d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmul_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[MUL_I]] uint16x4_t test_vmul_u16(uint16x4_t a, uint16x4_t b) { return vmul_u16(a, b); } -// CHECK-LABEL: test_vmul_u32 -// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmul_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[MUL_I]] uint32x2_t test_vmul_u32(uint32x2_t a, uint32x2_t b) { return vmul_u32(a, b); } -// CHECK-LABEL: test_vmulq_s8 -// CHECK: vmul.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[MUL_I]] int8x16_t test_vmulq_s8(int8x16_t a, int8x16_t b) { return vmulq_s8(a, b); } -// CHECK-LABEL: test_vmulq_s16 -// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[MUL_I]] int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) { return vmulq_s16(a, b); } -// CHECK-LABEL: test_vmulq_s32 -// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[MUL_I]] int32x4_t test_vmulq_s32(int32x4_t a, int32x4_t b) { return vmulq_s32(a, b); } -// CHECK-LABEL: test_vmulq_f32 -// CHECK: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, %b +// CHECK: ret <4 x float> [[MUL_I]] float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b) { return vmulq_f32(a, b); } -// CHECK-LABEL: test_vmulq_u8 -// CHECK: vmul.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[MUL_I]] uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) { return vmulq_u8(a, b); } -// CHECK-LABEL: test_vmulq_u16 -// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[MUL_I]] uint16x8_t test_vmulq_u16(uint16x8_t a, uint16x8_t b) { return vmulq_u16(a, b); } -// CHECK-LABEL: test_vmulq_u32 -// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[MUL_I]] uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) { return vmulq_u32(a, b); } -// CHECK-LABEL: test_vmull_s8 -// CHECK: vmull.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i16> [[VMULL_I]] int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) { return vmull_s8(a, b); } -// CHECK-LABEL: test_vmull_s16 -// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) { return vmull_s16(a, b); } -// CHECK-LABEL: test_vmull_s32 -// CHECK: vmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) { return vmull_s32(a, b); } -// CHECK-LABEL: test_vmull_u8 -// CHECK: vmull.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i16> [[VMULL_I]] uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) { return vmull_u8(a, b); } -// CHECK-LABEL: test_vmull_u16 -// CHECK: vmull.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) { return vmull_u16(a, b); } -// CHECK-LABEL: test_vmull_u32 -// CHECK: vmull.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) { return vmull_u32(a, b); } -// CHECK-LABEL: test_vmull_p8 -// CHECK: vmull.p8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i16> [[VMULL_I]] poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) { return vmull_p8(a, b); } -// CHECK-LABEL: test_vmull_lane_s16 -// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 
x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) { return vmull_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vmull_lane_s32 -// CHECK: vmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) { return vmull_lane_s32(a, b, 1); } -// CHECK-LABEL: test_vmull_lane_u16 -// CHECK: vmull.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) { return vmull_lane_u16(a, b, 3); } -// CHECK-LABEL: test_vmull_lane_u32 -// CHECK: vmull.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) { return vmull_lane_u32(a, b, 1); } -// CHECK-LABEL: test_vmull_n_s16 -// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x 
-// CHECK-LABEL: test_vmull_n_s16 -// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL5_I]] int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) { return vmull_n_s16(a, b); }
-// CHECK-LABEL: test_vmull_n_s32 -// CHECK: vmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL3_I]] int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) { return vmull_n_s32(a, b); }
-// CHECK-LABEL: test_vmull_n_u16 -// CHECK: vmull.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4 +// CHECK: ret <4 x i32> [[VMULL5_I]] uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) { return vmull_n_u16(a, b); }
-// CHECK-LABEL: test_vmull_n_u32 -// CHECK: vmull.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #4 +// CHECK: ret <2 x i64> [[VMULL3_I]] uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) { return vmull_n_u32(a, b); }
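// Editorial sketch (not part of the checked test): the vmull_n_* IR above
// first materializes the scalar operand as a vector through a chain of
// insertelement instructions, i.e. a vdup spelled out. The sketch name is
// hypothetical; it assumes <arm_neon.h>, as included by this file.
int32x4_t mull_n_sketch(int16x4_t a, int16_t b) {
  // same result as vmull_n_s16(a, b): splat b across all lanes, then
  // perform the widening multiply
  return vmull_s16(a, vdup_n_s16(b));
}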
-// CHECK-LABEL: test_vmul_p8 -// CHECK: vmul.p8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmul_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VMUL_V_I]] poly8x8_t test_vmul_p8(poly8x8_t a, poly8x8_t b) { return vmul_p8(a, b); }
-// CHECK-LABEL: test_vmulq_p8 -// CHECK: vmul.p8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VMULQ_V_I]] poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) { return vmulq_p8(a, b); }
-// CHECK-LABEL: test_vmul_lane_s16 -// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: ret <4 x i16> [[MUL]] int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) { return vmul_lane_s16(a, b, 3); }
-// CHECK-LABEL: test_vmul_lane_s32 -// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: ret <2 x i32> [[MUL]] int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) { return vmul_lane_s32(a, b, 1); }
-// CHECK-LABEL: test_vmul_lane_f32 -// CHECK: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] +// CHECK: ret <2 x float> [[MUL]] float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) { return vmul_lane_f32(a, b, 1); }
-// CHECK-LABEL: test_vmul_lane_u16 -// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: ret <4 x i16> [[MUL]] uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) { return vmul_lane_u16(a, b, 3); }
-// CHECK-LABEL: test_vmul_lane_u32 -// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: ret <2 x i32> [[MUL]] uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) { return vmul_lane_u32(a, b, 1); }
-// CHECK-LABEL: test_vmulq_lane_s16 -// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: ret <8 x i16> [[MUL]] int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) { return vmulq_lane_s16(a, b, 3); }
-// CHECK-LABEL: test_vmulq_lane_s32 -// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: ret <4 x i32> [[MUL]] int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) { return vmulq_lane_s32(a, b, 1); }
-// CHECK-LABEL: test_vmulq_lane_f32 -// CHECK: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] +// CHECK: ret <4 x float> [[MUL]] float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) { return vmulq_lane_f32(a, b, 1); }
-// CHECK-LABEL: test_vmulq_lane_u16 -// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: ret <8 x i16> [[MUL]] uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) { return vmulq_lane_u16(a, b, 3); }
-// CHECK-LABEL: test_vmulq_lane_u32 -// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: ret <4 x i32> [[MUL]] uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) { return vmulq_lane_u32(a, b, 1); }
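// Editorial sketch (not part of the checked test): unlike vmull_lane, the
// non-widening lane forms above lower to a shufflevector splat followed by a
// plain mul/fmul, with no target intrinsic at all. The sketch name is
// hypothetical; it assumes <arm_neon.h>, as included by this file.
float32x4_t mulq_lane1_sketch(float32x4_t a, float32x2_t b) {
  // same result as vmulq_lane_f32(a, b, 1): splat lane 1 of b to four
  // lanes (mask <1, 1, 1, 1>), then a plain fmul
  return vmulq_f32(a, vdupq_lane_f32(b, 1));
}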
-// CHECK-LABEL: test_vmul_n_s16 -// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i16> [[MUL_I]] int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) { return vmul_n_s16(a, b); }
-// CHECK-LABEL: test_vmul_n_s32 -// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]] +// CHECK: ret <2 x i32> [[MUL_I]] int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) { return vmul_n_s32(a, b); }
-// CHECK-LABEL: test_vmul_n_f32 -// CHECK: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1 +// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]] +// CHECK: ret <2 x float> [[MUL_I]] float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) { return vmul_n_f32(a, b); }
-// CHECK-LABEL: test_vmul_n_u16 -// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i16> [[MUL_I]] uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) { return vmul_n_u16(a, b); }
-// CHECK-LABEL: test_vmul_n_u32 -// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]] +// CHECK: ret <2 x i32> [[MUL_I]] uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) { return vmul_n_u32(a, b); }
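// Editorial sketch (not part of the checked test): vmul_n_* is the same
// splat-then-multiply pattern with a scalar argument; the insertelement chain
// above is just vdup_n spelled out in IR. The sketch name is hypothetical; it
// assumes <arm_neon.h>, as included by this file.
float32x2_t mul_n_sketch(float32x2_t a, float32_t b) {
  // same result as vmul_n_f32(a, b)
  return vmul_f32(a, vdup_n_f32(b));
}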
-// CHECK-LABEL: test_vmulq_n_s16 -// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]] +// CHECK: ret <8 x i16> [[MUL_I]] int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) { return vmulq_n_s16(a, b); }
-// CHECK-LABEL: test_vmulq_n_s32 -// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i32> [[MUL_I]] int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) { return vmulq_n_s32(a, b); }
-// CHECK-LABEL: test_vmulq_n_f32 -// CHECK: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0] +// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3 +// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]] +// CHECK: ret <4 x float> [[MUL_I]] float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) { return vmulq_n_f32(a, b); }
-// CHECK-LABEL: test_vmulq_n_u16 -// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 +// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]] +// CHECK: ret <8 x i16> [[MUL_I]] uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) { return vmulq_n_u16(a, b); }
-// CHECK-LABEL: test_vmulq_n_u32 -// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 +// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]] +// CHECK: ret <4 x i32> [[MUL_I]] uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) { return vmulq_n_u32(a, b); }
-// CHECK-LABEL: test_vmvn_s8 -// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: ret <8 x i8> [[NEG_I]] int8x8_t test_vmvn_s8(int8x8_t a) { return vmvn_s8(a); }
-// CHECK-LABEL: test_vmvn_s16 -// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: ret <4 x i16> [[NEG_I]] int16x4_t test_vmvn_s16(int16x4_t a) { return vmvn_s16(a); }
-// CHECK-LABEL: test_vmvn_s32 -// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1> +// CHECK: ret <2 x i32> [[NEG_I]] int32x2_t test_vmvn_s32(int32x2_t a) { return vmvn_s32(a); }
-// CHECK-LABEL: test_vmvn_u8 -// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmvn_u8(<8 x i8> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: ret <8 x i8> [[NEG_I]] uint8x8_t test_vmvn_u8(uint8x8_t a) { return vmvn_u8(a); }
-// CHECK-LABEL: test_vmvn_u16 -// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vmvn_u16(<4 x i16> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: ret <4 x i16> [[NEG_I]] uint16x4_t test_vmvn_u16(uint16x4_t a) { return vmvn_u16(a); }
-// CHECK-LABEL: test_vmvn_u32 -// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vmvn_u32(<2 x i32> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1> +// CHECK: ret <2 x i32> [[NEG_I]] uint32x2_t test_vmvn_u32(uint32x2_t a) { return vmvn_u32(a); }
-// CHECK-LABEL: test_vmvn_p8 -// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vmvn_p8(<8 x i8> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: ret <8 x i8> [[NEG_I]] poly8x8_t test_vmvn_p8(poly8x8_t a) { return vmvn_p8(a); }
-// CHECK-LABEL: test_vmvnq_s8 -// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: ret <16 x i8> [[NEG_I]] int8x16_t test_vmvnq_s8(int8x16_t a) { return vmvnq_s8(a); }
-// CHECK-LABEL: test_vmvnq_s16 -// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: ret <8 x i16> [[NEG_I]] int16x8_t test_vmvnq_s16(int16x8_t a) { return vmvnq_s16(a); }
-// CHECK-LABEL: test_vmvnq_s32 -// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: ret <4 x i32> [[NEG_I]] int32x4_t test_vmvnq_s32(int32x4_t a) { return vmvnq_s32(a); }
-// CHECK-LABEL: test_vmvnq_u8 -// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8(<16 x i8> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: ret <16 x i8> [[NEG_I]] uint8x16_t test_vmvnq_u8(uint8x16_t a) { return vmvnq_u8(a); }
-// CHECK-LABEL: test_vmvnq_u16 -// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16(<8 x i16> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: ret <8 x i16> [[NEG_I]] uint16x8_t test_vmvnq_u16(uint16x8_t a) { return vmvnq_u16(a); }
-// CHECK-LABEL: test_vmvnq_u32 -// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32(<4 x i32> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: ret <4 x i32> [[NEG_I]] uint32x4_t test_vmvnq_u32(uint32x4_t a) { return vmvnq_u32(a); }
-// CHECK-LABEL: test_vmvnq_p8 -// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8(<16 x i8> %a) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: ret <16 x i8> [[NEG_I]] poly8x16_t test_vmvnq_p8(poly8x16_t a) { return vmvnq_p8(a); }
-// CHECK-LABEL: test_vneg_s8 -// CHECK: vneg.s8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 { +// CHECK: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a +// CHECK: ret <8 x i8> [[SUB_I]] int8x8_t test_vneg_s8(int8x8_t a) { return vneg_s8(a); }
-// CHECK-LABEL: test_vneg_s16 -// CHECK: vneg.s16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 { +// CHECK: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a +// CHECK: ret <4 x i16> [[SUB_I]] int16x4_t test_vneg_s16(int16x4_t a) { return vneg_s16(a); }
-// CHECK-LABEL: test_vneg_s32 -// CHECK: vneg.s32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 { +// CHECK: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a +// CHECK: ret <2 x i32> [[SUB_I]] int32x2_t test_vneg_s32(int32x2_t a) { return vneg_s32(a); }
-// CHECK-LABEL: test_vneg_f32 -// CHECK: vneg.f32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vneg_f32(<2 x float> %a) #0 { +// CHECK: [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a +// CHECK: ret <2 x float> [[SUB_I]] float32x2_t test_vneg_f32(float32x2_t a) { return vneg_f32(a); }
-// CHECK-LABEL: test_vnegq_s8 -// CHECK: vneg.s8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 { +// CHECK: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a +// CHECK: ret <16 x i8> [[SUB_I]] int8x16_t test_vnegq_s8(int8x16_t a) { return vnegq_s8(a); }
-// CHECK-LABEL: test_vnegq_s16 -// CHECK: vneg.s16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 { +// CHECK: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a +// CHECK: ret <8 x i16> [[SUB_I]] int16x8_t test_vnegq_s16(int16x8_t a) { return vnegq_s16(a); }
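// Editorial sketch (not part of the checked test): vmvn lowers to a plain xor
// with an all-ones splat, integer vneg to a sub from zeroinitializer, and
// (in the blocks that follow) floating-point vneg to an fsub from -0.0 so
// that signed zeros are negated correctly. The sketch names are hypothetical;
// they assume <arm_neon.h>, as included by this file.
uint8x8_t not_sketch(uint8x8_t a) {
  return vmvn_u8(a);    // IR: xor %a, <i8 -1, i8 -1, ..., i8 -1>
}
float32x2_t neg_sketch(float32x2_t a) {
  return vneg_f32(a);   // IR: fsub <float -0.0, float -0.0>, %a
}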
-// CHECK-LABEL: test_vnegq_s32 -// CHECK: vneg.s32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 { +// CHECK: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a +// CHECK: ret <4 x i32> [[SUB_I]] int32x4_t test_vnegq_s32(int32x4_t a) { return vnegq_s32(a); }
-// CHECK-LABEL: test_vnegq_f32 -// CHECK: vneg.f32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 { +// CHECK: [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a +// CHECK: ret <4 x float> [[SUB_I]] float32x4_t test_vnegq_f32(float32x4_t a) { return vnegq_f32(a); }
-// CHECK-LABEL: test_vorn_s8 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vorn_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[OR_I]] int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) { return vorn_s8(a, b); }
-// CHECK-LABEL: test_vorn_s16 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vorn_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[OR_I]] int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) { return vorn_s16(a, b); }
-// CHECK-LABEL: test_vorn_s32 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vorn_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1> +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]] +// CHECK: ret <2 x i32> [[OR_I]] int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) { return vorn_s32(a, b); }
-// CHECK-LABEL: test_vorn_s64 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vorn_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1> +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[OR_I]] int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) { return vorn_s64(a, b); }
-// CHECK-LABEL: test_vorn_u8 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vorn_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]] +// CHECK: ret <8 x i8> [[OR_I]] uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) { return vorn_u8(a, b); }
-// CHECK-LABEL: test_vorn_u16 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vorn_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]] +// CHECK: ret <4 x i16> [[OR_I]] uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) { return vorn_u16(a, b); }
-// CHECK-LABEL: test_vorn_u32 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vorn_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1> +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]] +// CHECK: ret <2 x i32> [[OR_I]] uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) { return vorn_u32(a, b); }
-// CHECK-LABEL: test_vorn_u64 -// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vorn_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1> +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]] +// CHECK: ret <1 x i64> [[OR_I]] uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) { return vorn_u64(a, b); }
-// CHECK-LABEL: test_vornq_s8 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vornq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[OR_I]] int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) { return vornq_s8(a, b); }
-// CHECK-LABEL: test_vornq_s16 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[OR_I]] int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) { return vornq_s16(a, b); }
-// CHECK-LABEL: test_vornq_s32 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vornq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[OR_I]] int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) { return vornq_s32(a, b); }
-// CHECK-LABEL: test_vornq_s64 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vornq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1> +// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]] +// CHECK: ret <2 x i64> [[OR_I]] int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) { return vornq_s64(a, b); }
-// CHECK-LABEL: test_vornq_u8 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vornq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]] +// CHECK: ret <16 x i8> [[OR_I]] uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) { return vornq_u8(a, b); }
-// CHECK-LABEL: test_vornq_u16 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vornq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]] +// CHECK: ret <8 x i16> [[OR_I]] uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) { return vornq_u16(a, b); }
-// CHECK-LABEL: test_vornq_u32 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1> +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]] +// CHECK: ret <4 x i32> [[OR_I]] uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) { return vornq_u32(a, b); }
-// CHECK-LABEL: test_vornq_u64 -// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vornq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1> +// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]] +// CHECK: ret <2 x i64> [[OR_I]] uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) { return vornq_u64(a, b); }
-// CHECK-LABEL: test_vorr_s8 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vorr_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[OR_I]] int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) { return vorr_s8(a, b); }
-// CHECK-LABEL: test_vorr_s16 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vorr_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b +// CHECK: ret <4 x i16>
[[OR_I]] int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) { return vorr_s16(a, b); } -// CHECK-LABEL: test_vorr_s32 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vorr_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[OR_I]] int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) { return vorr_s32(a, b); } -// CHECK-LABEL: test_vorr_s64 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vorr_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[OR_I]] int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) { return vorr_s64(a, b); } -// CHECK-LABEL: test_vorr_u8 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vorr_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b +// CHECK: ret <8 x i8> [[OR_I]] uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) { return vorr_u8(a, b); } -// CHECK-LABEL: test_vorr_u16 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vorr_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b +// CHECK: ret <4 x i16> [[OR_I]] uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) { return vorr_u16(a, b); } -// CHECK-LABEL: test_vorr_u32 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vorr_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b +// CHECK: ret <2 x i32> [[OR_I]] uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) { return vorr_u32(a, b); } -// CHECK-LABEL: test_vorr_u64 -// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vorr_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b +// CHECK: ret <1 x i64> [[OR_I]] uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) { return vorr_u64(a, b); } -// CHECK-LABEL: test_vorrq_s8 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b +// CHECK: ret <16 x i8> [[OR_I]] int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) { return vorrq_s8(a, b); } -// CHECK-LABEL: test_vorrq_s16 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[OR_I]] int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) { return vorrq_s16(a, b); } -// CHECK-LABEL: test_vorrq_s32 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[OR_I]] int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) { return vorrq_s32(a, b); } -// CHECK-LABEL: test_vorrq_s64 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[OR_I]] int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) { return vorrq_s64(a, b); } -// CHECK-LABEL: test_vorrq_u8 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b +// 
CHECK: ret <16 x i8> [[OR_I]] uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) { return vorrq_u8(a, b); } -// CHECK-LABEL: test_vorrq_u16 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b +// CHECK: ret <8 x i16> [[OR_I]] uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) { return vorrq_u16(a, b); } -// CHECK-LABEL: test_vorrq_u32 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b +// CHECK: ret <4 x i32> [[OR_I]] uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) { return vorrq_u32(a, b); } -// CHECK-LABEL: test_vorrq_u64 -// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b +// CHECK: ret <2 x i64> [[OR_I]] uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) { return vorrq_u64(a, b); } -// CHECK-LABEL: test_vpadal_s8 -// CHECK: vpadal.s8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %b) #4 +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]] +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) { return vpadal_s8(a, b); } -// CHECK-LABEL: test_vpadal_s16 -// CHECK: vpadal.s16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <2 x i32> [[TMP3]] int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) { return vpadal_s16(a, b); } -// CHECK-LABEL: test_vpadal_s32 -// CHECK: vpadal.s32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <1 x i64> [[TMP3]] int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) { return vpadal_s32(a, b); } -// CHECK-LABEL: test_vpadal_u8 -// CHECK: vpadal.u8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %b) #4 +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]] +// CHECK: ret <4 x i16> [[TMP2]] 
uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) { return vpadal_u8(a, b); } -// CHECK-LABEL: test_vpadal_u16 -// CHECK: vpadal.u16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <2 x i32> [[TMP3]] uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) { return vpadal_u16(a, b); } -// CHECK-LABEL: test_vpadal_u32 -// CHECK: vpadal.u32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <1 x i64> [[TMP3]] uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) { return vpadal_u32(a, b); } -// CHECK-LABEL: test_vpadalq_s8 -// CHECK: vpadal.s8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %b) #4 +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]] +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) { return vpadalq_s8(a, b); } -// CHECK-LABEL: test_vpadalq_s16 -// CHECK: vpadal.s16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <4 x i32> [[TMP3]] int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) { return vpadalq_s16(a, b); } -// CHECK-LABEL: test_vpadalq_s32 -// CHECK: vpadal.s32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <2 x i64> [[TMP3]] int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) { return vpadalq_s32(a, b); } -// CHECK-LABEL: test_vpadalq_u8 -// CHECK: vpadal.u8 
q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %b) #4 +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]] +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) { return vpadalq_u8(a, b); } -// CHECK-LABEL: test_vpadalq_u16 -// CHECK: vpadal.u16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <4 x i32> [[TMP3]] uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) { return vpadalq_u16(a, b); } -// CHECK-LABEL: test_vpadalq_u32 -// CHECK: vpadal.u32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #4 +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]] +// CHECK: ret <2 x i64> [[TMP3]] uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) { return vpadalq_u32(a, b); } -// CHECK-LABEL: test_vpadd_s8 -// CHECK: vpadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vpadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPADD_V_I]] int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) { return vpadd_s8(a, b); } -// CHECK-LABEL: test_vpadd_s16 -// CHECK: vpadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) { return vpadd_s16(a, b); } -// CHECK-LABEL: test_vpadd_s32 -// CHECK: vpadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADD_V1_I:%.*]] = 
bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) { return vpadd_s32(a, b); } -// CHECK-LABEL: test_vpadd_u8 -// CHECK: vpadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vpadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPADD_V_I]] uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) { return vpadd_u8(a, b); } -// CHECK-LABEL: test_vpadd_u16 -// CHECK: vpadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) { return vpadd_u16(a, b); } -// CHECK-LABEL: test_vpadd_u32 -// CHECK: vpadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) { return vpadd_u32(a, b); } -// CHECK-LABEL: test_vpadd_f32 -// CHECK: vpadd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vpadd_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> [[VPADD_V_I]], <2 x float> [[VPADD_V1_I]]) #4 +// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) { return vpadd_f32(a, b); } -// CHECK-LABEL: test_vpaddl_s8 -// CHECK: vpaddl.s8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 { +// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %a) #4 +// CHECK: ret <4 x i16> [[VPADDL_I]] int16x4_t test_vpaddl_s8(int8x8_t a) { return vpaddl_s8(a); } -// CHECK-LABEL: test_vpaddl_s16 -// CHECK: vpaddl.s16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4 +// CHECK: ret <2 x i32> [[VPADDL1_I]] int32x2_t test_vpaddl_s16(int16x4_t a) { return vpaddl_s16(a); } -// CHECK-LABEL: test_vpaddl_s32 -// CHECK: vpaddl.s32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4 +// CHECK: ret <1 x i64> [[VPADDL1_I]] int64x1_t test_vpaddl_s32(int32x2_t a) { return vpaddl_s32(a); } -// CHECK-LABEL: test_vpaddl_u8 -// CHECK: vpaddl.u8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 { +// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %a) #4 +// CHECK: ret <4 x i16> [[VPADDL_I]] uint16x4_t test_vpaddl_u8(uint8x8_t a) { return vpaddl_u8(a); } -// CHECK-LABEL: test_vpaddl_u16 -// CHECK: vpaddl.u16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4 +// CHECK: ret <2 x i32> [[VPADDL1_I]] uint32x2_t test_vpaddl_u16(uint16x4_t a) { return vpaddl_u16(a); } -// CHECK-LABEL: test_vpaddl_u32 -// CHECK: vpaddl.u32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4 +// CHECK: ret <1 x i64> [[VPADDL1_I]] uint64x1_t test_vpaddl_u32(uint32x2_t a) { return vpaddl_u32(a); } -// CHECK-LABEL: test_vpaddlq_s8 -// CHECK: vpaddl.s8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 { +// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %a) #4 +// CHECK: ret <8 x i16> [[VPADDL_I]] int16x8_t test_vpaddlq_s8(int8x16_t a) { return vpaddlq_s8(a); } -// CHECK-LABEL: test_vpaddlq_s16 -// CHECK: vpaddl.s16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4 +// CHECK: ret <4 x i32> [[VPADDL1_I]] int32x4_t test_vpaddlq_s16(int16x8_t a) { return vpaddlq_s16(a); } -// CHECK-LABEL: test_vpaddlq_s32 -// CHECK: vpaddl.s32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: 
[[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4 +// CHECK: ret <2 x i64> [[VPADDL1_I]] int64x2_t test_vpaddlq_s32(int32x4_t a) { return vpaddlq_s32(a); } -// CHECK-LABEL: test_vpaddlq_u8 -// CHECK: vpaddl.u8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 { +// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %a) #4 +// CHECK: ret <8 x i16> [[VPADDL_I]] uint16x8_t test_vpaddlq_u8(uint8x16_t a) { return vpaddlq_u8(a); } -// CHECK-LABEL: test_vpaddlq_u16 -// CHECK: vpaddl.u16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4 +// CHECK: ret <4 x i32> [[VPADDL1_I]] uint32x4_t test_vpaddlq_u16(uint16x8_t a) { return vpaddlq_u16(a); } -// CHECK-LABEL: test_vpaddlq_u32 -// CHECK: vpaddl.u32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4 +// CHECK: ret <2 x i64> [[VPADDL1_I]] uint64x2_t test_vpaddlq_u32(uint32x4_t a) { return vpaddlq_u32(a); } -// CHECK-LABEL: test_vpmax_s8 -// CHECK: vpmax.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vpmax_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMAX_I]] int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) { return vpmax_s8(a, b); } -// CHECK-LABEL: test_vpmax_s16 -// CHECK: vpmax.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpmax_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMAX2_I]] int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) { return vpmax_s16(a, b); } -// CHECK-LABEL: test_vpmax_s32 -// CHECK: vpmax.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpmax_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMAX2_I]] int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) { return vpmax_s32(a, b); } -// CHECK-LABEL: test_vpmax_u8 -// CHECK: vpmax.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vpmax_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMAX_I:%.*]] = 
call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMAX_I]] uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) { return vpmax_u8(a, b); } -// CHECK-LABEL: test_vpmax_u16 -// CHECK: vpmax.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpmax_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMAX2_I]] uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) { return vpmax_u16(a, b); } -// CHECK-LABEL: test_vpmax_u32 -// CHECK: vpmax.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpmax_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMAX2_I]] uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) { return vpmax_u32(a, b); } -// CHECK-LABEL: test_vpmax_f32 -// CHECK: vpmax.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vpmax_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> [[VPMAX_I]], <2 x float> [[VPMAX1_I]]) #4 +// CHECK: ret <2 x float> [[VPMAX2_I]] float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) { return vpmax_f32(a, b); } -// CHECK-LABEL: test_vpmin_s8 -// CHECK: vpmin.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vpmin_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMIN_I]] int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) { return vpmin_s8(a, b); } -// CHECK-LABEL: test_vpmin_s16 -// CHECK: vpmin.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpmin_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMIN2_I]] int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) { return vpmin_s16(a, b); } -// CHECK-LABEL: test_vpmin_s32 -// CHECK: vpmin.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpmin_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast 
<2 x i32> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMIN2_I]] int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) { return vpmin_s32(a, b); } -// CHECK-LABEL: test_vpmin_u8 -// CHECK: vpmin.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vpmin_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VPMIN_I]] uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) { return vpmin_u8(a, b); } -// CHECK-LABEL: test_vpmin_u16 -// CHECK: vpmin.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vpmin_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4 +// CHECK: ret <4 x i16> [[VPMIN2_I]] uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) { return vpmin_u16(a, b); } -// CHECK-LABEL: test_vpmin_u32 -// CHECK: vpmin.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vpmin_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4 +// CHECK: ret <2 x i32> [[VPMIN2_I]] uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) { return vpmin_u32(a, b); } -// CHECK-LABEL: test_vpmin_f32 -// CHECK: vpmin.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vpmin_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> [[VPMIN_I]], <2 x float> [[VPMIN1_I]]) #4 +// CHECK: ret <2 x float> [[VPMIN2_I]] float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) { return vpmin_f32(a, b); } -// CHECK-LABEL: test_vqabs_s8 -// CHECK: vqabs.s8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 { +// CHECK: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %a) #4 +// CHECK: ret <8 x i8> [[VQABS_V_I]] int8x8_t test_vqabs_s8(int8x8_t a) { return vqabs_s8(a); } -// CHECK-LABEL: test_vqabs_s16 -// CHECK: vqabs.s16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[VQABS_V_I]]) #4 +// 
+// CHECK: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vqabs_s16(int16x4_t a) {
   return vqabs_s16(a);
 }

-// CHECK-LABEL: test_vqabs_s32
-// CHECK: vqabs.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> [[VQABS_V_I]]) #4
+// CHECK: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vqabs_s32(int32x2_t a) {
   return vqabs_s32(a);
 }

-// CHECK-LABEL: test_vqabsq_s8
-// CHECK: vqabs.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %a) #4
+// CHECK: ret <16 x i8> [[VQABSQ_V_I]]
 int8x16_t test_vqabsq_s8(int8x16_t a) {
   return vqabsq_s8(a);
 }

-// CHECK-LABEL: test_vqabsq_s16
-// CHECK: vqabs.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> [[VQABSQ_V_I]]) #4
+// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP1]]
 int16x8_t test_vqabsq_s16(int16x8_t a) {
   return vqabsq_s16(a);
 }

-// CHECK-LABEL: test_vqabsq_s32
-// CHECK: vqabs.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> [[VQABSQ_V_I]]) #4
+// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
 int32x4_t test_vqabsq_s32(int32x4_t a) {
   return vqabsq_s32(a);
 }

-// CHECK-LABEL: test_vqadd_s8
-// CHECK: vqadd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQADD_V_I]]
 int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
   return vqadd_s8(a, b);
 }

-// CHECK-LABEL: test_vqadd_s16
-// CHECK: vqadd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
 int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
   return vqadd_s16(a, b);
 }

-// CHECK-LABEL: test_vqadd_s32
-// CHECK: vqadd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
 int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
   return vqadd_s32(a, b);
 }

-// CHECK-LABEL: test_vqadd_s64
-// CHECK: vqadd.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
 int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
   return vqadd_s64(a, b);
 }

-// CHECK-LABEL: test_vqadd_u8
-// CHECK: vqadd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VQADD_V_I]]
 uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
   return vqadd_u8(a, b);
 }

-// CHECK-LABEL: test_vqadd_u16
-// CHECK: vqadd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
 uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
   return vqadd_u16(a, b);
 }

-// CHECK-LABEL: test_vqadd_u32
-// CHECK: vqadd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
 uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
   return vqadd_u32(a, b);
 }

-// CHECK-LABEL: test_vqadd_u64
-// CHECK: vqadd.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
+// CHECK: ret <1 x i64> [[TMP2]]
 uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
   return vqadd_u64(a, b);
 }

-// CHECK-LABEL: test_vqaddq_s8
-// CHECK: vqadd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
 int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
   return vqaddq_s8(a, b);
 }

-// CHECK-LABEL: test_vqaddq_s16
-// CHECK: vqadd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
 int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
   return vqaddq_s16(a, b);
 }

-// CHECK-LABEL: test_vqaddq_s32
-// CHECK: vqadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
   return vqaddq_s32(a, b);
 }

-// CHECK-LABEL: test_vqaddq_s64
-// CHECK: vqadd.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
 int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
   return vqaddq_s64(a, b);
 }

-// CHECK-LABEL: test_vqaddq_u8
-// CHECK: vqadd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
 uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vqaddq_u8(a, b);
 }

-// CHECK-LABEL: test_vqaddq_u16
-// CHECK: vqadd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
 uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vqaddq_u16(a, b);
 }

-// CHECK-LABEL: test_vqaddq_u32
-// CHECK: vqadd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vqaddq_u32(a, b);
 }

-// CHECK-LABEL: test_vqaddq_u64
-// CHECK: vqadd.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
 uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
   return vqaddq_u64(a, b);
 }

-// CHECK-LABEL: test_vqdmlal_s16
-// CHECK: vqdmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlal_s16(a, b, c);
 }

-// CHECK-LABEL: test_vqdmlal_s32
-// CHECK: vqdmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlal_s32(a, b, c);
 }

-// CHECK-LABEL: test_vqdmlal_lane_s16
-// CHECK: vqdmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlal_lane_s16(a, b, c, 3);
 }

-// CHECK-LABEL: test_vqdmlal_lane_s32
-// CHECK: vqdmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlal_lane_s32(a, b, c, 1);
 }

-// CHECK-LABEL: test_vqdmlal_n_s16
-// CHECK: vqdmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
+// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
+// CHECK: ret <4 x i32> [[VQDMLAL_V6_I]]
 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vqdmlal_n_s16(a, b, c);
 }

-// CHECK-LABEL: test_vqdmlal_n_s32
-// CHECK: vqdmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
+// CHECK: ret <2 x i64> [[VQDMLAL_V4_I]]
 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vqdmlal_n_s32(a, b, c);
 }

-// CHECK-LABEL: test_vqdmlsl_s16
-// CHECK: vqdmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlsl_s16(a, b, c);
 }

-// CHECK-LABEL: test_vqdmlsl_s32
-// CHECK: vqdmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlsl_s32(a, b, c);
 }

-// CHECK-LABEL: test_vqdmlsl_lane_s16
-// CHECK: vqdmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlsl_lane_s16(a, b, c, 3);
 }

-// CHECK-LABEL: test_vqdmlsl_lane_s32
-// CHECK: vqdmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlsl_lane_s32(a, b, c, 1);
 }

-// CHECK-LABEL: test_vqdmlsl_n_s16
-// CHECK: vqdmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
+// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
+// CHECK: ret <4 x i32> [[VQDMLSL_V6_I]]
 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vqdmlsl_n_s16(a, b, c);
 }

-// CHECK-LABEL: test_vqdmlsl_n_s32
-// CHECK: vqdmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
+// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
+// CHECK: ret <2 x i64> [[VQDMLSL_V4_I]]
 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vqdmlsl_n_s32(a, b, c);
 }

-// CHECK-LABEL: test_vqdmulh_s16
-// CHECK: vqdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
+// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
   return vqdmulh_s16(a, b);
 }

-// CHECK-LABEL: test_vqdmulh_s32
-// CHECK: vqdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
+// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
   return vqdmulh_s32(a, b);
 }

-// CHECK-LABEL: test_vqdmulhq_s16
-// CHECK: vqdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
+// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
   return vqdmulhq_s16(a, b);
 }

-// CHECK-LABEL: test_vqdmulhq_s32
-// CHECK: vqdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
+// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
   return vqdmulhq_s32(a, b);
 }

-// CHECK-LABEL: test_vqdmulh_lane_s16
-// CHECK: vqdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
+// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
   return vqdmulh_lane_s16(a, b, 3);
 }

-// CHECK-LABEL: test_vqdmulh_lane_s32
-// CHECK: vqdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
+// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
   return vqdmulh_lane_s32(a, b, 1);
 }

-// CHECK-LABEL: test_vqdmulhq_lane_s16
-// CHECK: vqdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
+// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
   return vqdmulhq_lane_s16(a, b, 3);
 }

-// CHECK-LABEL: test_vqdmulhq_lane_s32
-// CHECK: vqdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
+// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
   return vqdmulhq_lane_s32(a, b, 1);
 }

-// CHECK-LABEL: test_vqdmulh_n_s16
-// CHECK: vqdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #4
+// CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
   return vqdmulh_n_s16(a, b);
 }

-// CHECK-LABEL: test_vqdmulh_n_s32
-// CHECK: vqdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #4
+// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
   return vqdmulh_n_s32(a, b);
 }

-// CHECK-LABEL: test_vqdmulhq_n_s16
-// CHECK: vqdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #4
+// CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
   return vqdmulhq_n_s16(a, b);
 }

-// CHECK-LABEL: test_vqdmulhq_n_s32
-// CHECK: vqdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #4
+// CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
   return vqdmulhq_n_s32(a, b);
 }

-// CHECK-LABEL: test_vqdmull_s16
-// CHECK: vqdmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
+// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
   return vqdmull_s16(a, b);
 }

-// CHECK-LABEL: test_vqdmull_s32
-// CHECK: vqdmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
+// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
   return vqdmull_s32(a, b);
 }

-// CHECK-LABEL: test_vqdmull_lane_s16
-// CHECK: vqdmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
+// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
   return vqdmull_lane_s16(a, b, 3);
 }

-// CHECK-LABEL: test_vqdmull_lane_s32
-// CHECK: vqdmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
+// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
   return vqdmull_lane_s32(a, b, 1);
 }

-// CHECK-LABEL: test_vqdmull_n_s16
-// CHECK: vqdmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #4
+// CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
   return vqdmull_n_s16(a, b);
 }

-// CHECK-LABEL: test_vqdmull_n_s32
-// CHECK: vqdmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #4
+// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64>
+// CHECK: ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
   return vqdmull_n_s32(a, b);
 }

-// CHECK-LABEL: test_vqmovn_s16
-// CHECK: vqmovn.s16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
+// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
 int8x8_t test_vqmovn_s16(int16x8_t a) {
   return vqmovn_s16(a);
 }

-// CHECK-LABEL: test_vqmovn_s32
-// CHECK: vqmovn.s32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
+// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vqmovn_s32(int32x4_t a) {
   return vqmovn_s32(a);
 }

-// CHECK-LABEL: test_vqmovn_s64
-// CHECK: vqmovn.s64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
+// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vqmovn_s64(int64x2_t a) {
   return vqmovn_s64(a);
 }

-// CHECK-LABEL: test_vqmovn_u16
-// CHECK: vqmovn.u16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
+// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
 uint8x8_t test_vqmovn_u16(uint16x8_t a) {
   return vqmovn_u16(a);
 }

-// CHECK-LABEL: test_vqmovn_u32
-// CHECK: vqmovn.u32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
+// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 uint16x4_t test_vqmovn_u32(uint32x4_t a) {
   return vqmovn_u32(a);
 }

-// CHECK-LABEL: test_vqmovn_u64
-// CHECK: vqmovn.u64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
+// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 uint32x2_t test_vqmovn_u64(uint64x2_t a) {
   return vqmovn_u64(a);
 }

-// CHECK-LABEL: test_vqmovun_s16
-// CHECK: vqmovun.s16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[VQMOVUN_V_I]]) #4
+// CHECK: ret <8 x i8> [[VQMOVUN_V1_I]]
 uint8x8_t test_vqmovun_s16(int16x8_t a) {
   return vqmovun_s16(a);
 }

-// CHECK-LABEL: test_vqmovun_s32
-// CHECK: vqmovun.s32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[VQMOVUN_V_I]]) #4
+// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 uint16x4_t test_vqmovun_s32(int32x4_t a) {
   return vqmovun_s32(a);
 }

-// CHECK-LABEL: test_vqmovun_s64
-// CHECK: vqmovun.s64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[VQMOVUN_V_I]]) #4
+// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 uint32x2_t test_vqmovun_s64(int64x2_t a) {
   return vqmovun_s64(a);
 }

-// CHECK-LABEL: test_vqneg_s8
-// CHECK: vqneg.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
+// CHECK: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %a) #4
+// CHECK: ret <8 x i8> [[VQNEG_V_I]]
 int8x8_t test_vqneg_s8(int8x8_t a) {
   return vqneg_s8(a);
 }

-// CHECK-LABEL: test_vqneg_s16
-// CHECK: vqneg.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[VQNEG_V_I]]) #4
+// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP1]]
 int16x4_t test_vqneg_s16(int16x4_t a) {
   return vqneg_s16(a);
 }

-// CHECK-LABEL: test_vqneg_s32
-// CHECK: vqneg.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> [[VQNEG_V_I]]) #4
+// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP1]]
 int32x2_t test_vqneg_s32(int32x2_t a) {
   return vqneg_s32(a);
 }

-// CHECK-LABEL: test_vqnegq_s8
-// CHECK: vqneg.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
+// CHECK: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %a) #4
+// CHECK: ret <16 x i8> [[VQNEGQ_V_I]]
 int8x16_t test_vqnegq_s8(int8x16_t a) {
   return vqnegq_s8(a);
 }

-// CHECK-LABEL: test_vqnegq_s16
-// CHECK: vqneg.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VQNEGQ_V_I]]) #4
+// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP1]]
 int16x8_t test_vqnegq_s16(int16x8_t a) {
   return vqnegq_s16(a);
 }

-// CHECK-LABEL: test_vqnegq_s32
-// CHECK: vqneg.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> [[VQNEGQ_V_I]]) #4
+// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <4 x i32>
+// CHECK: ret <4 x i32> [[TMP1]]
 int32x4_t test_vqnegq_s32(int32x4_t a) {
   return vqnegq_s32(a);
 }

-// CHECK-LABEL: test_vqrdmulh_s16
-// CHECK: vqrdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
+// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
   return vqrdmulh_s16(a, b);
 }

-// CHECK-LABEL: test_vqrdmulh_s32
-// CHECK: vqrdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
+// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
   return vqrdmulh_s32(a, b);
 }

-// CHECK-LABEL: test_vqrdmulhq_s16
-// CHECK: vqrdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
[[VQRDMULHQ_V1_I]]) #4 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) { return vqrdmulhq_s16(a, b); } -// CHECK-LABEL: test_vqrdmulhq_s32 -// CHECK: vqrdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) { return vqrdmulhq_s32(a, b); } -// CHECK-LABEL: test_vqrdmulh_lane_s16 -// CHECK: vqrdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) { return vqrdmulh_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vqrdmulh_lane_s32 -// CHECK: vqrdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4 +// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) { return vqrdmulh_lane_s32(a, b, 1); } -// CHECK-LABEL: test_vqrdmulhq_lane_s16 -// CHECK: vqrdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] =
bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) { return vqrdmulhq_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vqrdmulhq_lane_s32 -// CHECK: vqrdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}] +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4 +// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) { return vqrdmulhq_lane_s32(a, b, 1); } -// CHECK-LABEL: test_vqrdmulh_n_s16 -// CHECK: vqrdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #4 +// CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) { return vqrdmulh_n_s16(a, b); } -// CHECK-LABEL: test_vqrdmulh_n_s32 -// CHECK: vqrdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32>
@llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #4 +// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) { return vqrdmulh_n_s32(a, b); } -// CHECK-LABEL: test_vqrdmulhq_n_s16 -// CHECK: vqrdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 +// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 +// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 +// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 +// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #4 +// CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) { return vqrdmulhq_n_s16(a, b); } -// CHECK-LABEL: test_vqrdmulhq_n_s32 -// CHECK: vqrdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 +// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 +// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 +// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #4 +// CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) { return vqrdmulhq_n_s32(a, b); } -// CHECK-LABEL: test_vqrshl_s8 -// CHECK: vqrshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQRSHL_V_I]] int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) { return 
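/* For orientation only: a rough scalar model of the per-lane behaviour the
   vqrshl family (llvm.aarch64.neon.sqrshl/uqrshl below) exercises. This is a
   sketch under assumptions, not part of the test, and `rqshl8` is a
   hypothetical helper name. The shift count is the signed value in the
   corresponding lane of b; a negative count shifts right with rounding, and
   the result saturates to the lane type (int8_t lanes shown, using <stdint.h>
   limits; out-of-range shift counts are ignored here):
     static inline int8_t rqshl8(int8_t x, int8_t n) {
       int32_t r = (n < 0) ? ((x + (1 << (-n - 1))) >> -n)  // rounding right shift
                           : ((int32_t)x << n);             // left shift
       return r > INT8_MAX ? INT8_MAX : r < INT8_MIN ? INT8_MIN : (int8_t)r;  // saturate
     }
*/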
vqrshl_s8(a, b); } -// CHECK-LABEL: test_vqrshl_s16 -// CHECK: vqrshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) { return vqrshl_s16(a, b); } -// CHECK-LABEL: test_vqrshl_s32 -// CHECK: vqrshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) { return vqrshl_s32(a, b); } -// CHECK-LABEL: test_vqrshl_s64 -// CHECK: vqrshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) { return vqrshl_s64(a, b); } -// CHECK-LABEL: test_vqrshl_u8 -// CHECK: vqrshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQRSHL_V_I]] uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) { return vqrshl_u8(a, b); } -// CHECK-LABEL: test_vqrshl_u16 -// CHECK: vqrshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x 
i16> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) { return vqrshl_u16(a, b); } -// CHECK-LABEL: test_vqrshl_u32 -// CHECK: vqrshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) { return vqrshl_u32(a, b); } -// CHECK-LABEL: test_vqrshl_u64 -// CHECK: vqrshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4 +// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) { return vqrshl_u64(a, b); } -// CHECK-LABEL: test_vqrshlq_s8 -// CHECK: vqrshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]] int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) { return vqrshlq_s8(a, b); } -// CHECK-LABEL: test_vqrshlq_s16 -// CHECK: vqrshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) { return vqrshlq_s16(a, b); } -// CHECK-LABEL: test_vqrshlq_s32 -// CHECK: vqrshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// 
CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) { return vqrshlq_s32(a, b); } -// CHECK-LABEL: test_vqrshlq_s64 -// CHECK: vqrshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) { return vqrshlq_s64(a, b); } -// CHECK-LABEL: test_vqrshlq_u8 -// CHECK: vqrshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]] uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) { return vqrshlq_u8(a, b); } -// CHECK-LABEL: test_vqrshlq_u16 -// CHECK: vqrshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) { return vqrshlq_u16(a, b); } -// CHECK-LABEL: test_vqrshlq_u32 -// CHECK: vqrshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) { return vqrshlq_u32(a, b); } -// CHECK-LABEL: test_vqrshlq_u64 -// CHECK: vqrshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, 
q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4 +// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) { return vqrshlq_u64(a, b); } -// CHECK-LABEL: test_vqrshrn_n_s16 -// CHECK: vqrshrn.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 1) +// CHECK: ret <8 x i8> [[VQRSHRN_N]]1 int8x8_t test_vqrshrn_n_s16(int16x8_t a) { return vqrshrn_n_s16(a, 1); } -// CHECK-LABEL: test_vqrshrn_n_s32 -// CHECK: vqrshrn.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 1) +// CHECK: ret <4 x i16> [[VQRSHRN_N]]1 int16x4_t test_vqrshrn_n_s32(int32x4_t a) { return vqrshrn_n_s32(a, 1); } -// CHECK-LABEL: test_vqrshrn_n_s64 -// CHECK: vqrshrn.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 1) +// CHECK: ret <2 x i32> [[VQRSHRN_N]]1 int32x2_t test_vqrshrn_n_s64(int64x2_t a) { return vqrshrn_n_s64(a, 1); } -// CHECK-LABEL: test_vqrshrn_n_u16 -// CHECK: vqrshrn.u16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 1) +// CHECK: ret <8 x i8> [[VQRSHRN_N]]1 uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) { return vqrshrn_n_u16(a, 1); } -// CHECK-LABEL: test_vqrshrn_n_u32 -// CHECK: vqrshrn.u32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 1) +// CHECK: ret <4 x i16> [[VQRSHRN_N]]1 uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) { return vqrshrn_n_u32(a, 1); } -// CHECK-LABEL: test_vqrshrn_n_u64 -// CHECK: vqrshrn.u64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> 
@test_vqrshrn_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 1) +// CHECK: ret <2 x i32> [[VQRSHRN_N]]1 uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) { return vqrshrn_n_u64(a, 1); } -// CHECK-LABEL: test_vqrshrun_n_s16 -// CHECK: vqrshrun.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 1) +// CHECK: ret <8 x i8> [[VQRSHRUN_N]]1 uint8x8_t test_vqrshrun_n_s16(int16x8_t a) { return vqrshrun_n_s16(a, 1); } -// CHECK-LABEL: test_vqrshrun_n_s32 -// CHECK: vqrshrun.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 1) +// CHECK: ret <4 x i16> [[VQRSHRUN_N]]1 uint16x4_t test_vqrshrun_n_s32(int32x4_t a) { return vqrshrun_n_s32(a, 1); } -// CHECK-LABEL: test_vqrshrun_n_s64 -// CHECK: vqrshrun.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 1) +// CHECK: ret <2 x i32> [[VQRSHRUN_N]]1 uint32x2_t test_vqrshrun_n_s64(int64x2_t a) { return vqrshrun_n_s64(a, 1); } -// CHECK-LABEL: test_vqshl_s8 -// CHECK: vqshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQSHL_V_I]] int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) { return vqshl_s8(a, b); } -// CHECK-LABEL: test_vqshl_s16 -// CHECK: vqshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4 +// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) { return vqshl_s16(a, b); } -// CHECK-LABEL: test_vqshl_s32 -// CHECK: vqshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x 
i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4 +// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) { return vqshl_s32(a, b); } -// CHECK-LABEL: test_vqshl_s64 -// CHECK: vqshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqshl_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4 +// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) { return vqshl_s64(a, b); } -// CHECK-LABEL: test_vqshl_u8 -// CHECK: vqshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQSHL_V_I]] uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) { return vqshl_u8(a, b); } -// CHECK-LABEL: test_vqshl_u16 -// CHECK: vqshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4 +// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) { return vqshl_u16(a, b); } -// CHECK-LABEL: test_vqshl_u32 -// CHECK: vqshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4 +// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) { return vqshl_u32(a, b); } -// CHECK-LABEL: test_vqshl_u64 -// CHECK: vqshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqshl_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4 +// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) { return vqshl_u64(a, b); } -// CHECK-LABEL: test_vqshlq_s8 -// CHECK: vqshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQSHLQ_V_I]] int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) { return vqshlq_s8(a, b); } -// CHECK-LABEL: test_vqshlq_s16 -// CHECK: vqshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4 +// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) { return vqshlq_s16(a, b); } -// CHECK-LABEL: test_vqshlq_s32 -// CHECK: vqshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4 +// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) { return vqshlq_s32(a, b); } -// CHECK-LABEL: test_vqshlq_s64 -// CHECK: vqshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4 +// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) { 
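/* Note for the immediate-shift tests that follow: the _n forms (vqshl_n_*,
   vqshlu_n_*) are expected to lower to the same llvm.aarch64.neon.sqshl,
   uqshl, and sqshlu intrinsics as the by-register forms above, with the
   immediate splatted into a constant vector second operand. A sketch of the
   IR for vqshl_n_s8(a, 1), matching the CHECK lines below:
     %r = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
*/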
return vqshlq_s64(a, b); } -// CHECK-LABEL: test_vqshlq_u8 -// CHECK: vqshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQSHLQ_V_I]] uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) { return vqshlq_u8(a, b); } -// CHECK-LABEL: test_vqshlq_u16 -// CHECK: vqshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4 +// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) { return vqshlq_u16(a, b); } -// CHECK-LABEL: test_vqshlq_u32 -// CHECK: vqshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4 +// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) { return vqshlq_u32(a, b); } -// CHECK-LABEL: test_vqshlq_u64 -// CHECK: vqshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4 +// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) { return vqshlq_u64(a, b); } -// CHECK-LABEL: test_vqshlu_n_s8 -// CHECK: vqshlu.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) +// CHECK: ret <8 x i8> [[VQSHLU_N]] uint8x8_t test_vqshlu_n_s8(int8x8_t a) { return vqshlu_n_s8(a, 1); } -// CHECK-LABEL: test_vqshlu_n_s16 -// CHECK: vqshlu.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8
x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>) +// CHECK: ret <4 x i16> [[VQSHLU_N]]1 uint16x4_t test_vqshlu_n_s16(int16x4_t a) { return vqshlu_n_s16(a, 1); } -// CHECK-LABEL: test_vqshlu_n_s32 -// CHECK: vqshlu.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 1, i32 1>) +// CHECK: ret <2 x i32> [[VQSHLU_N]]1 uint32x2_t test_vqshlu_n_s32(int32x2_t a) { return vqshlu_n_s32(a, 1); } -// CHECK-LABEL: test_vqshlu_n_s64 -// CHECK: vqshlu.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>) +// CHECK: ret <1 x i64> [[VQSHLU_N]]1 uint64x1_t test_vqshlu_n_s64(int64x1_t a) { return vqshlu_n_s64(a, 1); } -// CHECK-LABEL: test_vqshluq_n_s8 -// CHECK: vqshlu.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) +// CHECK: ret <16 x i8> [[VQSHLU_N]] uint8x16_t test_vqshluq_n_s8(int8x16_t a) { return vqshluq_n_s8(a, 1); } -// CHECK-LABEL: test_vqshluq_n_s16 -// CHECK: vqshlu.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>) +// CHECK: ret <8 x i16> [[VQSHLU_N]]1 uint16x8_t test_vqshluq_n_s16(int16x8_t a) { return vqshluq_n_s16(a, 1); } -// CHECK-LABEL: test_vqshluq_n_s32 -// CHECK: vqshlu.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>) +// CHECK: ret <4 x i32> [[VQSHLU_N]]1 uint32x4_t test_vqshluq_n_s32(int32x4_t a) { return vqshluq_n_s32(a, 1); } -// CHECK-LABEL: test_vqshluq_n_s64 -// CHECK: vqshlu.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 1, i64 1>) +// CHECK: ret <2 x i64> [[VQSHLU_N]]1 uint64x2_t test_vqshluq_n_s64(int64x2_t a) { return vqshluq_n_s64(a, 1); } -// CHECK-LABEL: test_vqshl_n_s8 -// CHECK: vqshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x
i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) +// CHECK: ret <8 x i8> [[VQSHL_N]] int8x8_t test_vqshl_n_s8(int8x8_t a) { return vqshl_n_s8(a, 1); } -// CHECK-LABEL: test_vqshl_n_s16 -// CHECK: vqshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>) +// CHECK: ret <4 x i16> [[VQSHL_N]]1 int16x4_t test_vqshl_n_s16(int16x4_t a) { return vqshl_n_s16(a, 1); } -// CHECK-LABEL: test_vqshl_n_s32 -// CHECK: vqshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>) +// CHECK: ret <2 x i32> [[VQSHL_N]]1 int32x2_t test_vqshl_n_s32(int32x2_t a) { return vqshl_n_s32(a, 1); } -// CHECK-LABEL: test_vqshl_n_s64 -// CHECK: vqshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>) +// CHECK: ret <1 x i64> [[VQSHL_N]]1 int64x1_t test_vqshl_n_s64(int64x1_t a) { return vqshl_n_s64(a, 1); } -// CHECK-LABEL: test_vqshl_n_u8 -// CHECK: vqshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) #0 { +// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) +// CHECK: ret <8 x i8> [[VQSHL_N]] uint8x8_t test_vqshl_n_u8(uint8x8_t a) { return vqshl_n_u8(a, 1); } -// CHECK-LABEL: test_vqshl_n_u16 -// CHECK: vqshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>) +// CHECK: ret <4 x i16> [[VQSHL_N]]1 uint16x4_t test_vqshl_n_u16(uint16x4_t a) { return vqshl_n_u16(a, 1); } -// CHECK-LABEL: test_vqshl_n_u32 -// CHECK: vqshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>) +// CHECK: ret <2 x i32> [[VQSHL_N]]1 uint32x2_t test_vqshl_n_u32(uint32x2_t a) { return vqshl_n_u32(a, 1); } -// CHECK-LABEL: test_vqshl_n_u64 -// CHECK: vqshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>) +// CHECK: ret <1 x i64> [[VQSHL_N]]1 uint64x1_t test_vqshl_n_u64(uint64x1_t a) { return vqshl_n_u64(a, 1); } -// CHECK-LABEL:
test_vqshlq_n_s8 -// CHECK: vqshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) +// CHECK: ret <16 x i8> [[VQSHL_N]] int8x16_t test_vqshlq_n_s8(int8x16_t a) { return vqshlq_n_s8(a, 1); } -// CHECK-LABEL: test_vqshlq_n_s16 -// CHECK: vqshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>) +// CHECK: ret <8 x i16> [[VQSHL_N]]1 int16x8_t test_vqshlq_n_s16(int16x8_t a) { return vqshlq_n_s16(a, 1); } -// CHECK-LABEL: test_vqshlq_n_s32 -// CHECK: vqshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>) +// CHECK: ret <4 x i32> [[VQSHL_N]]1 int32x4_t test_vqshlq_n_s32(int32x4_t a) { return vqshlq_n_s32(a, 1); } -// CHECK-LABEL: test_vqshlq_n_s64 -// CHECK: vqshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>) +// CHECK: ret <2 x i64> [[VQSHL_N]]1 int64x2_t test_vqshlq_n_s64(int64x2_t a) { return vqshlq_n_s64(a, 1); } -// CHECK-LABEL: test_vqshlq_n_u8 -// CHECK: vqshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) #0 { +// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>) +// CHECK: ret <16 x i8> [[VQSHL_N]] uint8x16_t test_vqshlq_n_u8(uint8x16_t a) { return vqshlq_n_u8(a, 1); } -// CHECK-LABEL: test_vqshlq_n_u16 -// CHECK: vqshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>) +// CHECK: ret <8 x i16> [[VQSHL_N]]1 uint16x8_t test_vqshlq_n_u16(uint16x8_t a) { return vqshlq_n_u16(a, 1); } -// CHECK-LABEL: test_vqshlq_n_u32 -// CHECK: vqshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>) +// CHECK: ret <4 x i32> [[VQSHL_N]]1 uint32x4_t test_vqshlq_n_u32(uint32x4_t a) { return vqshlq_n_u32(a, 1); } -// CHECK-LABEL: test_vqshlq_n_u64 -// CHECK: vqshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK:
[[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>) +// CHECK: ret <2 x i64> [[VQSHL_N]]1 uint64x2_t test_vqshlq_n_u64(uint64x2_t a) { return vqshlq_n_u64(a, 1); } -// CHECK-LABEL: test_vqshrn_n_s16 -// CHECK: vqshrn.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 1) +// CHECK: ret <8 x i8> [[VQSHRN_N]]1 int8x8_t test_vqshrn_n_s16(int16x8_t a) { return vqshrn_n_s16(a, 1); } -// CHECK-LABEL: test_vqshrn_n_s32 -// CHECK: vqshrn.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 1) +// CHECK: ret <4 x i16> [[VQSHRN_N]]1 int16x4_t test_vqshrn_n_s32(int32x4_t a) { return vqshrn_n_s32(a, 1); } -// CHECK-LABEL: test_vqshrn_n_s64 -// CHECK: vqshrn.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 1) +// CHECK: ret <2 x i32> [[VQSHRN_N]]1 int32x2_t test_vqshrn_n_s64(int64x2_t a) { return vqshrn_n_s64(a, 1); } -// CHECK-LABEL: test_vqshrn_n_u16 -// CHECK: vqshrn.u16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 1) +// CHECK: ret <8 x i8> [[VQSHRN_N]]1 uint8x8_t test_vqshrn_n_u16(uint16x8_t a) { return vqshrn_n_u16(a, 1); } -// CHECK-LABEL: test_vqshrn_n_u32 -// CHECK: vqshrn.u32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 1) +// CHECK: ret <4 x i16> [[VQSHRN_N]]1 uint16x4_t test_vqshrn_n_u32(uint32x4_t a) { return vqshrn_n_u32(a, 1); } -// CHECK-LABEL: test_vqshrn_n_u64 -// CHECK: vqshrn.u64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 1) +// CHECK: ret <2 x i32> [[VQSHRN_N]]1 uint32x2_t test_vqshrn_n_u64(uint64x2_t a) { return vqshrn_n_u64(a, 1); } -// CHECK-LABEL: test_vqshrun_n_s16 -// CHECK: vqshrun.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) #0 { +//
CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 1) +// CHECK: ret <8 x i8> [[VQSHRUN_N]]1 uint8x8_t test_vqshrun_n_s16(int16x8_t a) { return vqshrun_n_s16(a, 1); } -// CHECK-LABEL: test_vqshrun_n_s32 -// CHECK: vqshrun.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 1) +// CHECK: ret <4 x i16> [[VQSHRUN_N]]1 uint16x4_t test_vqshrun_n_s32(int32x4_t a) { return vqshrun_n_s32(a, 1); } -// CHECK-LABEL: test_vqshrun_n_s64 -// CHECK: vqshrun.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 1) +// CHECK: ret <2 x i32> [[VQSHRUN_N]]1 uint32x2_t test_vqshrun_n_s64(int64x2_t a) { return vqshrun_n_s64(a, 1); } -// CHECK-LABEL: test_vqsub_s8 -// CHECK: vqsub.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqsub_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQSUB_V_I]] int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) { return vqsub_s8(a, b); } -// CHECK-LABEL: test_vqsub_s16 -// CHECK: vqsub.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqsub_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) { return vqsub_s16(a, b); } -// CHECK-LABEL: test_vqsub_s32 -// CHECK: vqsub.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqsub_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) { return vqsub_s32(a, b); } -// CHECK-LABEL: test_vqsub_s64 -// CHECK: vqsub.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: 
define <1 x i64> @test_vqsub_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) { return vqsub_s64(a, b); } -// CHECK-LABEL: test_vqsub_u8 -// CHECK: vqsub.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vqsub_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VQSUB_V_I]] uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) { return vqsub_u8(a, b); } -// CHECK-LABEL: test_vqsub_u16 -// CHECK: vqsub.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vqsub_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) { return vqsub_u16(a, b); } -// CHECK-LABEL: test_vqsub_u32 -// CHECK: vqsub.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vqsub_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) { return vqsub_u32(a, b); } -// CHECK-LABEL: test_vqsub_u64 -// CHECK: vqsub.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vqsub_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4 +// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vqsub_u64(uint64x1_t a, 
uint64x1_t b) { return vqsub_u64(a, b); } -// CHECK-LABEL: test_vqsubq_s8 -// CHECK: vqsub.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQSUBQ_V_I]] int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) { return vqsubq_s8(a, b); } -// CHECK-LABEL: test_vqsubq_s16 -// CHECK: vqsub.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) { return vqsubq_s16(a, b); } -// CHECK-LABEL: test_vqsubq_s32 -// CHECK: vqsub.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) { return vqsubq_s32(a, b); } -// CHECK-LABEL: test_vqsubq_s64 -// CHECK: vqsub.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) { return vqsubq_s64(a, b); } -// CHECK-LABEL: test_vqsubq_u8 -// CHECK: vqsub.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vqsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VQSUBQ_V_I]] uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) { return vqsubq_u8(a, b); } -// CHECK-LABEL: test_vqsubq_u16 -// CHECK: vqsub.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 { 
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) { return vqsubq_u16(a, b); } -// CHECK-LABEL: test_vqsubq_u32 -// CHECK: vqsub.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) { return vqsubq_u32(a, b); } -// CHECK-LABEL: test_vqsubq_u64 -// CHECK: vqsub.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4 +// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) { return vqsubq_u64(a, b); } -// CHECK-LABEL: test_vraddhn_s16 -// CHECK: vraddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRADDHN_V2_I]] int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) { return vraddhn_s16(a, b); } -// CHECK-LABEL: test_vraddhn_s32 -// CHECK: vraddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRADDHN_V2_I:%.*]] = 
call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) { return vraddhn_s32(a, b); } -// CHECK-LABEL: test_vraddhn_s64 -// CHECK: vraddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) { return vraddhn_s64(a, b); } -// CHECK-LABEL: test_vraddhn_u16 -// CHECK: vraddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRADDHN_V2_I]] uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) { return vraddhn_u16(a, b); } -// CHECK-LABEL: test_vraddhn_u32 -// CHECK: vraddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) { return vraddhn_u32(a, b); } -// CHECK-LABEL: test_vraddhn_u64 -// CHECK: vraddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4 +// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32> +// CHECK: 
ret <2 x i32> [[TMP2]] uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) { return vraddhn_u64(a, b); } -// CHECK-LABEL: test_vrecpe_f32 -// CHECK: vrecpe.f32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> [[VRECPE_V_I]]) #4 +// CHECK: ret <2 x float> [[VRECPE_V1_I]] float32x2_t test_vrecpe_f32(float32x2_t a) { return vrecpe_f32(a); } -// CHECK-LABEL: test_vrecpe_u32 -// CHECK: vrecpe.u32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> [[VRECPE_V_I]]) #4 +// CHECK: ret <2 x i32> [[VRECPE_V1_I]] uint32x2_t test_vrecpe_u32(uint32x2_t a) { return vrecpe_u32(a); } -// CHECK-LABEL: test_vrecpeq_f32 -// CHECK: vrecpe.f32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> [[VRECPEQ_V_I]]) #4 +// CHECK: ret <4 x float> [[VRECPEQ_V1_I]] float32x4_t test_vrecpeq_f32(float32x4_t a) { return vrecpeq_f32(a); } -// CHECK-LABEL: test_vrecpeq_u32 -// CHECK: vrecpe.u32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> [[VRECPEQ_V_I]]) #4 +// CHECK: ret <4 x i32> [[VRECPEQ_V1_I]] uint32x4_t test_vrecpeq_u32(uint32x4_t a) { return vrecpeq_u32(a); } -// CHECK-LABEL: test_vrecps_f32 -// CHECK: vrecps.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vrecps_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> [[VRECPS_V_I]], <2 x float> [[VRECPS_V1_I]]) #4 +// CHECK: [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) { return vrecps_f32(a, b); } -// CHECK-LABEL: test_vrecpsq_f32 -// CHECK: vrecps.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float> 
@llvm.aarch64.neon.frecps.v4f32(<4 x float> [[VRECPSQ_V_I]], <4 x float> [[VRECPSQ_V1_I]]) #4 +// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vrecpsq_f32(float32x4_t a, float32x4_t b) { return vrecpsq_f32(a, b); } -// CHECK-LABEL: test_vreinterpret_s8_s16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_s16(int16x4_t a) { return vreinterpret_s8_s16(a); } -// CHECK-LABEL: test_vreinterpret_s8_s32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_s32(int32x2_t a) { return vreinterpret_s8_s32(a); } -// CHECK-LABEL: test_vreinterpret_s8_s64 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_s64(int64x1_t a) { return vreinterpret_s8_s64(a); } -// CHECK-LABEL: test_vreinterpret_s8_u8 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) { return vreinterpret_s8_u8(a); } -// CHECK-LABEL: test_vreinterpret_s8_u16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) { return vreinterpret_s8_u16(a); } -// CHECK-LABEL: test_vreinterpret_s8_u32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) { return vreinterpret_s8_u32(a); } -// CHECK-LABEL: test_vreinterpret_s8_u64 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) { return vreinterpret_s8_u64(a); } -// CHECK-LABEL: test_vreinterpret_s8_f16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_f16(float16x4_t a) { return vreinterpret_s8_f16(a); } -// CHECK-LABEL: test_vreinterpret_s8_f32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_f32(float32x2_t a) { return vreinterpret_s8_f32(a); } -// CHECK-LABEL: test_vreinterpret_s8_p8 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) { return vreinterpret_s8_p8(a); } -// CHECK-LABEL: test_vreinterpret_s8_p16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) { return vreinterpret_s8_p16(a); } -// CHECK-LABEL: test_vreinterpret_s16_s8 +// CHECK-LABEL: define 
<4 x i16> @test_vreinterpret_s16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s8(int8x8_t a) { return vreinterpret_s16_s8(a); } -// CHECK-LABEL: test_vreinterpret_s16_s32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s32(int32x2_t a) { return vreinterpret_s16_s32(a); } -// CHECK-LABEL: test_vreinterpret_s16_s64 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_s64(int64x1_t a) { return vreinterpret_s16_s64(a); } -// CHECK-LABEL: test_vreinterpret_s16_u8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) { return vreinterpret_s16_u8(a); } -// CHECK-LABEL: test_vreinterpret_s16_u16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) { return vreinterpret_s16_u16(a); } -// CHECK-LABEL: test_vreinterpret_s16_u32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) { return vreinterpret_s16_u32(a); } -// CHECK-LABEL: test_vreinterpret_s16_u64 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) { return vreinterpret_s16_u64(a); } -// CHECK-LABEL: test_vreinterpret_s16_f16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_f16(float16x4_t a) { return vreinterpret_s16_f16(a); } -// CHECK-LABEL: test_vreinterpret_s16_f32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_f32(float32x2_t a) { return vreinterpret_s16_f32(a); } -// CHECK-LABEL: test_vreinterpret_s16_p8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) { return vreinterpret_s16_p8(a); } -// CHECK-LABEL: test_vreinterpret_s16_p16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) { return vreinterpret_s16_p16(a); } -// CHECK-LABEL: test_vreinterpret_s32_s8 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s8(int8x8_t a) { return vreinterpret_s32_s8(a); } -// CHECK-LABEL: test_vreinterpret_s32_s16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(<4 x i16> %a) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s16(int16x4_t a) { return vreinterpret_s32_s16(a); } -// CHECK-LABEL: test_vreinterpret_s32_s64 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_s64(int64x1_t a) { return vreinterpret_s32_s64(a); } -// CHECK-LABEL: test_vreinterpret_s32_u8 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) { return vreinterpret_s32_u8(a); } -// CHECK-LABEL: test_vreinterpret_s32_u16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) { return vreinterpret_s32_u16(a); } -// CHECK-LABEL: test_vreinterpret_s32_u32 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u32(<2 x i32> %a) #0 { +// CHECK: ret <2 x i32> %a int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) { return vreinterpret_s32_u32(a); } -// CHECK-LABEL: test_vreinterpret_s32_u64 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) { return vreinterpret_s32_u64(a); } -// CHECK-LABEL: test_vreinterpret_s32_f16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_f16(float16x4_t a) { return vreinterpret_s32_f16(a); } -// CHECK-LABEL: test_vreinterpret_s32_f32 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_f32(float32x2_t a) { return vreinterpret_s32_f32(a); } -// CHECK-LABEL: test_vreinterpret_s32_p8 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) { return vreinterpret_s32_p8(a); } -// CHECK-LABEL: test_vreinterpret_s32_p16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) { return vreinterpret_s32_p16(a); } -// CHECK-LABEL: test_vreinterpret_s64_s8 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s8(int8x8_t a) { return vreinterpret_s64_s8(a); } -// CHECK-LABEL: test_vreinterpret_s64_s16 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s16(int16x4_t a) { return vreinterpret_s64_s16(a); } -// CHECK-LABEL: test_vreinterpret_s64_s32 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(<2 x i32> %a) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_s32(int32x2_t a) { return vreinterpret_s64_s32(a); } -// CHECK-LABEL: test_vreinterpret_s64_u8 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) { return vreinterpret_s64_u8(a); } -// CHECK-LABEL: test_vreinterpret_s64_u16 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) { return vreinterpret_s64_u16(a); } -// CHECK-LABEL: test_vreinterpret_s64_u32 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) { return vreinterpret_s64_u32(a); } -// CHECK-LABEL: test_vreinterpret_s64_u64 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) { return vreinterpret_s64_u64(a); } -// CHECK-LABEL: test_vreinterpret_s64_f16 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_f16(float16x4_t a) { return vreinterpret_s64_f16(a); } -// CHECK-LABEL: test_vreinterpret_s64_f32 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_f32(float32x2_t a) { return vreinterpret_s64_f32(a); } -// CHECK-LABEL: test_vreinterpret_s64_p8 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) { return vreinterpret_s64_p8(a); } -// CHECK-LABEL: test_vreinterpret_s64_p16 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) { return vreinterpret_s64_p16(a); } -// CHECK-LABEL: test_vreinterpret_u8_s8 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) { return vreinterpret_u8_s8(a); } -// CHECK-LABEL: test_vreinterpret_u8_s16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) { return vreinterpret_u8_s16(a); } -// CHECK-LABEL: test_vreinterpret_u8_s32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) { return vreinterpret_u8_s32(a); } -// CHECK-LABEL: test_vreinterpret_u8_s64 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] 
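// NOTE (editorial, illustrative only): the vraddhn_* tests earlier in this
// hunk exercise "rounding add and narrow, high half": the two wide lanes are
// summed, a rounding constant of 1 << (shift - 1) is added, and the top half
// of the (modulo) result is kept. Both the signed and unsigned variants lower
// to the same @llvm.aarch64.neon.raddhn.* intrinsic because the operation is
// sign-agnostic on two's-complement bit patterns. A scalar sketch of the
// 16-to-8-bit case (hypothetical helper, assuming <stdint.h>):
static inline uint8_t raddhn_u16_model(uint16_t a, uint16_t b) {
  uint32_t sum = (uint32_t)a + (uint32_t)b + (1u << 7); // add + rounding const
  return (uint8_t)((sum & 0xFFFFu) >> 8);               // high half, mod 2^16
}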
uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) { return vreinterpret_u8_s64(a); } -// CHECK-LABEL: test_vreinterpret_u8_u16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) { return vreinterpret_u8_u16(a); } -// CHECK-LABEL: test_vreinterpret_u8_u32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) { return vreinterpret_u8_u32(a); } -// CHECK-LABEL: test_vreinterpret_u8_u64 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) { return vreinterpret_u8_u64(a); } -// CHECK-LABEL: test_vreinterpret_u8_f16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) { return vreinterpret_u8_f16(a); } -// CHECK-LABEL: test_vreinterpret_u8_f32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) { return vreinterpret_u8_f32(a); } -// CHECK-LABEL: test_vreinterpret_u8_p8 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) { return vreinterpret_u8_p8(a); } -// CHECK-LABEL: test_vreinterpret_u8_p16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) { return vreinterpret_u8_p16(a); } -// CHECK-LABEL: test_vreinterpret_u16_s8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) { return vreinterpret_u16_s8(a); } -// CHECK-LABEL: test_vreinterpret_u16_s16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) { return vreinterpret_u16_s16(a); } -// CHECK-LABEL: test_vreinterpret_u16_s32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) { return vreinterpret_u16_s32(a); } -// CHECK-LABEL: test_vreinterpret_u16_s64 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) { return vreinterpret_u16_s64(a); } -// CHECK-LABEL: test_vreinterpret_u16_u8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) { return vreinterpret_u16_u8(a); } -// CHECK-LABEL: 
test_vreinterpret_u16_u32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) { return vreinterpret_u16_u32(a); } -// CHECK-LABEL: test_vreinterpret_u16_u64 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) { return vreinterpret_u16_u64(a); } -// CHECK-LABEL: test_vreinterpret_u16_f16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) { return vreinterpret_u16_f16(a); } -// CHECK-LABEL: test_vreinterpret_u16_f32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) { return vreinterpret_u16_f32(a); } -// CHECK-LABEL: test_vreinterpret_u16_p8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) { return vreinterpret_u16_p8(a); } -// CHECK-LABEL: test_vreinterpret_u16_p16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) { return vreinterpret_u16_p16(a); } -// CHECK-LABEL: test_vreinterpret_u32_s8 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) { return vreinterpret_u32_s8(a); } -// CHECK-LABEL: test_vreinterpret_u32_s16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) { return vreinterpret_u32_s16(a); } -// CHECK-LABEL: test_vreinterpret_u32_s32 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(<2 x i32> %a) #0 { +// CHECK: ret <2 x i32> %a uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) { return vreinterpret_u32_s32(a); } -// CHECK-LABEL: test_vreinterpret_u32_s64 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) { return vreinterpret_u32_s64(a); } -// CHECK-LABEL: test_vreinterpret_u32_u8 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) { return vreinterpret_u32_u8(a); } -// CHECK-LABEL: test_vreinterpret_u32_u16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) { return vreinterpret_u32_u16(a); } -// CHECK-LABEL: test_vreinterpret_u32_u64 +// CHECK-LABEL: define <2 x 
i32> @test_vreinterpret_u32_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) { return vreinterpret_u32_u64(a); } -// CHECK-LABEL: test_vreinterpret_u32_f16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) { return vreinterpret_u32_f16(a); } -// CHECK-LABEL: test_vreinterpret_u32_f32 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) { return vreinterpret_u32_f32(a); } -// CHECK-LABEL: test_vreinterpret_u32_p8 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) { return vreinterpret_u32_p8(a); } -// CHECK-LABEL: test_vreinterpret_u32_p16 +// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> +// CHECK: ret <2 x i32> [[TMP0]] uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) { return vreinterpret_u32_p16(a); } -// CHECK-LABEL: test_vreinterpret_u64_s8 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) { return vreinterpret_u64_s8(a); } -// CHECK-LABEL: test_vreinterpret_u64_s16 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) { return vreinterpret_u64_s16(a); } -// CHECK-LABEL: test_vreinterpret_u64_s32 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) { return vreinterpret_u64_s32(a); } -// CHECK-LABEL: test_vreinterpret_u64_s64 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(<1 x i64> %a) #0 { +// CHECK: ret <1 x i64> %a uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) { return vreinterpret_u64_s64(a); } -// CHECK-LABEL: test_vreinterpret_u64_u8 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) { return vreinterpret_u64_u8(a); } -// CHECK-LABEL: test_vreinterpret_u64_u16 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) { return vreinterpret_u64_u16(a); } -// CHECK-LABEL: test_vreinterpret_u64_u32 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) { return vreinterpret_u64_u32(a); } -// CHECK-LABEL: test_vreinterpret_u64_f16 +// CHECK-LABEL: 
define <1 x i64> @test_vreinterpret_u64_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) { return vreinterpret_u64_f16(a); } -// CHECK-LABEL: test_vreinterpret_u64_f32 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) { return vreinterpret_u64_f32(a); } -// CHECK-LABEL: test_vreinterpret_u64_p8 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) { return vreinterpret_u64_p8(a); } -// CHECK-LABEL: test_vreinterpret_u64_p16 +// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> +// CHECK: ret <1 x i64> [[TMP0]] uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) { return vreinterpret_u64_p16(a); } -// CHECK-LABEL: test_vreinterpret_f16_s8 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s8(int8x8_t a) { return vreinterpret_f16_s8(a); } -// CHECK-LABEL: test_vreinterpret_f16_s16 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s16(int16x4_t a) { return vreinterpret_f16_s16(a); } -// CHECK-LABEL: test_vreinterpret_f16_s32 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s32(int32x2_t a) { return vreinterpret_f16_s32(a); } -// CHECK-LABEL: test_vreinterpret_f16_s64 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_s64(int64x1_t a) { return vreinterpret_f16_s64(a); } -// CHECK-LABEL: test_vreinterpret_f16_u8 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) { return vreinterpret_f16_u8(a); } -// CHECK-LABEL: test_vreinterpret_f16_u16 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) { return vreinterpret_f16_u16(a); } -// CHECK-LABEL: test_vreinterpret_f16_u32 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) { return vreinterpret_f16_u32(a); } -// CHECK-LABEL: test_vreinterpret_f16_u64 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t 
test_vreinterpret_f16_u64(uint64x1_t a) { return vreinterpret_f16_u64(a); } -// CHECK-LABEL: test_vreinterpret_f16_f32 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_f32(float32x2_t a) { return vreinterpret_f16_f32(a); } -// CHECK-LABEL: test_vreinterpret_f16_p8 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) { return vreinterpret_f16_p8(a); } -// CHECK-LABEL: test_vreinterpret_f16_p16 +// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> +// CHECK: ret <4 x half> [[TMP0]] float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) { return vreinterpret_f16_p16(a); } -// CHECK-LABEL: test_vreinterpret_f32_s8 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s8(int8x8_t a) { return vreinterpret_f32_s8(a); } -// CHECK-LABEL: test_vreinterpret_f32_s16 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s16(int16x4_t a) { return vreinterpret_f32_s16(a); } -// CHECK-LABEL: test_vreinterpret_f32_s32 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s32(int32x2_t a) { return vreinterpret_f32_s32(a); } -// CHECK-LABEL: test_vreinterpret_f32_s64 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_s64(int64x1_t a) { return vreinterpret_f32_s64(a); } -// CHECK-LABEL: test_vreinterpret_f32_u8 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) { return vreinterpret_f32_u8(a); } -// CHECK-LABEL: test_vreinterpret_f32_u16 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) { return vreinterpret_f32_u16(a); } -// CHECK-LABEL: test_vreinterpret_f32_u32 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) { return vreinterpret_f32_u32(a); } -// CHECK-LABEL: test_vreinterpret_f32_u64 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) { return vreinterpret_f32_u64(a); } -// CHECK-LABEL: test_vreinterpret_f32_f16 +// CHECK-LABEL: define <2 x float> 
@test_vreinterpret_f32_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_f16(float16x4_t a) { return vreinterpret_f32_f16(a); } -// CHECK-LABEL: test_vreinterpret_f32_p8 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) { return vreinterpret_f32_p8(a); } -// CHECK-LABEL: test_vreinterpret_f32_p16 +// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> +// CHECK: ret <2 x float> [[TMP0]] float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) { return vreinterpret_f32_p16(a); } -// CHECK-LABEL: test_vreinterpret_p8_s8 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) { return vreinterpret_p8_s8(a); } -// CHECK-LABEL: test_vreinterpret_p8_s16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) { return vreinterpret_p8_s16(a); } -// CHECK-LABEL: test_vreinterpret_p8_s32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) { return vreinterpret_p8_s32(a); } -// CHECK-LABEL: test_vreinterpret_p8_s64 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) { return vreinterpret_p8_s64(a); } -// CHECK-LABEL: test_vreinterpret_p8_u8 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(<8 x i8> %a) #0 { +// CHECK: ret <8 x i8> %a poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) { return vreinterpret_p8_u8(a); } -// CHECK-LABEL: test_vreinterpret_p8_u16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) { return vreinterpret_p8_u16(a); } -// CHECK-LABEL: test_vreinterpret_p8_u32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) { return vreinterpret_p8_u32(a); } -// CHECK-LABEL: test_vreinterpret_p8_u64 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) { return vreinterpret_p8_u64(a); } -// CHECK-LABEL: test_vreinterpret_p8_f16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) { return vreinterpret_p8_f16(a); } -// CHECK-LABEL: test_vreinterpret_p8_f32 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> 
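// NOTE (editorial, illustrative only): the frecpe/frecps pairs checked
// earlier in this hunk are designed to compose: vrecpe_f32 returns a
// low-precision estimate of 1/d, and vrecps_f32(d, x) computes 2.0 - d*x,
// the Newton-Raphson correction factor. A common usage sketch (not part of
// this test; the function name is invented for exposition):
static inline float32x2_t recip_newton_model(float32x2_t d) {
  float32x2_t x = vrecpe_f32(d);          // coarse estimate of 1/d
  x = vmul_f32(x, vrecps_f32(d, x));      // first refinement step
  x = vmul_f32(x, vrecps_f32(d, x));      // second step: near full precision
  return x;
}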
+// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) { return vreinterpret_p8_f32(a); } -// CHECK-LABEL: test_vreinterpret_p8_p16 +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: ret <8 x i8> [[TMP0]] poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) { return vreinterpret_p8_p16(a); } -// CHECK-LABEL: test_vreinterpret_p16_s8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) { return vreinterpret_p16_s8(a); } -// CHECK-LABEL: test_vreinterpret_p16_s16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) { return vreinterpret_p16_s16(a); } -// CHECK-LABEL: test_vreinterpret_p16_s32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) { return vreinterpret_p16_s32(a); } -// CHECK-LABEL: test_vreinterpret_p16_s64 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) { return vreinterpret_p16_s64(a); } -// CHECK-LABEL: test_vreinterpret_p16_u8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) { return vreinterpret_p16_u8(a); } -// CHECK-LABEL: test_vreinterpret_p16_u16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(<4 x i16> %a) #0 { +// CHECK: ret <4 x i16> %a poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) { return vreinterpret_p16_u16(a); } -// CHECK-LABEL: test_vreinterpret_p16_u32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) { return vreinterpret_p16_u32(a); } -// CHECK-LABEL: test_vreinterpret_p16_u64 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) { return vreinterpret_p16_u64(a); } -// CHECK-LABEL: test_vreinterpret_p16_f16 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(<4 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) { return vreinterpret_p16_f16(a); } -// CHECK-LABEL: test_vreinterpret_p16_f32 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) { return vreinterpret_p16_f32(a); } -// CHECK-LABEL: test_vreinterpret_p16_p8 +// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> +// CHECK: ret <4 x i16> [[TMP0]] poly16x4_t 
test_vreinterpret_p16_p8(poly8x8_t a) { return vreinterpret_p16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s8_s16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) { return vreinterpretq_s8_s16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_s32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) { return vreinterpretq_s8_s32(a); } -// CHECK-LABEL: test_vreinterpretq_s8_s64 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) { return vreinterpretq_s8_s64(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u8 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) { return vreinterpretq_s8_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) { return vreinterpretq_s8_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) { return vreinterpretq_s8_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s8_u64 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) { return vreinterpretq_s8_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s8_f16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) { return vreinterpretq_s8_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s8_f32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) { return vreinterpretq_s8_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s8_p8 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) { return vreinterpretq_s8_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s8_p16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) { return vreinterpretq_s8_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s16_s8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) { return 
vreinterpretq_s16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_s16_s32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) { return vreinterpretq_s16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_s16_s64 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) { return vreinterpretq_s16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) { return vreinterpretq_s16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) { return vreinterpretq_s16_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) { return vreinterpretq_s16_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s16_u64 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) { return vreinterpretq_s16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s16_f16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) { return vreinterpretq_s16_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s16_f32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) { return vreinterpretq_s16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s16_p8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) { return vreinterpretq_s16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s16_p16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) { return vreinterpretq_s16_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_s8 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) { return vreinterpretq_s32_s8(a); } -// CHECK-LABEL: test_vreinterpretq_s32_s16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) { return 
vreinterpretq_s32_s16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_s64 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) { return vreinterpretq_s32_s64(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u8 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) { return vreinterpretq_s32_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) { return vreinterpretq_s32_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u32 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32(<4 x i32> %a) #0 { +// CHECK: ret <4 x i32> %a int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) { return vreinterpretq_s32_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s32_u64 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) { return vreinterpretq_s32_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s32_f16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) { return vreinterpretq_s32_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s32_f32 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) { return vreinterpretq_s32_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s32_p8 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) { return vreinterpretq_s32_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s32_p16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) { return vreinterpretq_s32_p16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_s8 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) { return vreinterpretq_s64_s8(a); } -// CHECK-LABEL: test_vreinterpretq_s64_s16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) { return vreinterpretq_s64_s16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_s32 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] 
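// NOTE (editorial, illustrative only): every vreinterpret/vreinterpretq test
// in this stretch is a pure bit reinterpretation -- same 64 or 128 bits, new
// lane type -- which is why the new checks expect a single IR `bitcast`, or
// no instruction at all when source and destination lane types already match
// (e.g. test_vreinterpretq_s8_u8 simply returns %a). A C sketch of the same
// idea via memcpy-based type punning (hypothetical helper, assuming
// <string.h>):
static inline int32x2_t reinterpret_s32_s8_model(int8x8_t a) {
  int32x2_t r;
  memcpy(&r, &a, sizeof r);   // copy the 8 bytes verbatim; no per-lane convert
  return r;
}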
int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) { return vreinterpretq_s64_s32(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u8 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) { return vreinterpretq_s64_u8(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) { return vreinterpretq_s64_u16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u32 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) { return vreinterpretq_s64_u32(a); } -// CHECK-LABEL: test_vreinterpretq_s64_u64 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) { return vreinterpretq_s64_u64(a); } -// CHECK-LABEL: test_vreinterpretq_s64_f16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) { return vreinterpretq_s64_f16(a); } -// CHECK-LABEL: test_vreinterpretq_s64_f32 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) { return vreinterpretq_s64_f32(a); } -// CHECK-LABEL: test_vreinterpretq_s64_p8 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) { return vreinterpretq_s64_p8(a); } -// CHECK-LABEL: test_vreinterpretq_s64_p16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) { return vreinterpretq_s64_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s8 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) { return vreinterpretq_u8_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) { return vreinterpretq_u8_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) { return vreinterpretq_u8_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u8_s64 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t 
test_vreinterpretq_u8_s64(int64x2_t a) { return vreinterpretq_u8_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u8_u16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) { return vreinterpretq_u8_u16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_u32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) { return vreinterpretq_u8_u32(a); } -// CHECK-LABEL: test_vreinterpretq_u8_u64 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) { return vreinterpretq_u8_u64(a); } -// CHECK-LABEL: test_vreinterpretq_u8_f16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) { return vreinterpretq_u8_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u8_f32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) { return vreinterpretq_u8_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u8_p8 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) { return vreinterpretq_u8_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u8_p16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) { return vreinterpretq_u8_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) { return vreinterpretq_u16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) { return vreinterpretq_u16_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) { return vreinterpretq_u16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u16_s64 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) { return vreinterpretq_u16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u16_u8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t 
test_vreinterpretq_u16_u8(uint8x16_t a) { return vreinterpretq_u16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_u16_u32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) { return vreinterpretq_u16_u32(a); } -// CHECK-LABEL: test_vreinterpretq_u16_u64 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) { return vreinterpretq_u16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_u16_f16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) { return vreinterpretq_u16_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u16_f32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) { return vreinterpretq_u16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u16_p8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) { return vreinterpretq_u16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u16_p16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) { return vreinterpretq_u16_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s8 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) { return vreinterpretq_u32_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) { return vreinterpretq_u32_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s32 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32(<4 x i32> %a) #0 { +// CHECK: ret <4 x i32> %a uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) { return vreinterpretq_u32_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u32_s64 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) { return vreinterpretq_u32_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u32_u8 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) { return vreinterpretq_u32_u8(a); } -// CHECK-LABEL: test_vreinterpretq_u32_u16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t 
test_vreinterpretq_u32_u16(uint16x8_t a) { return vreinterpretq_u32_u16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_u64 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) { return vreinterpretq_u32_u64(a); } -// CHECK-LABEL: test_vreinterpretq_u32_f16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) { return vreinterpretq_u32_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u32_f32 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) { return vreinterpretq_u32_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u32_p8 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) { return vreinterpretq_u32_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u32_p16 +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> +// CHECK: ret <4 x i32> [[TMP0]] uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) { return vreinterpretq_u32_p16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s8 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) { return vreinterpretq_u64_s8(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) { return vreinterpretq_u64_s16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s32 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) { return vreinterpretq_u64_s32(a); } -// CHECK-LABEL: test_vreinterpretq_u64_s64 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64(<2 x i64> %a) #0 { +// CHECK: ret <2 x i64> %a uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) { return vreinterpretq_u64_s64(a); } -// CHECK-LABEL: test_vreinterpretq_u64_u8 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) { return vreinterpretq_u64_u8(a); } -// CHECK-LABEL: test_vreinterpretq_u64_u16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) { return vreinterpretq_u64_u16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_u32 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 
x i32> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) { return vreinterpretq_u64_u32(a); } -// CHECK-LABEL: test_vreinterpretq_u64_f16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) { return vreinterpretq_u64_f16(a); } -// CHECK-LABEL: test_vreinterpretq_u64_f32 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) { return vreinterpretq_u64_f32(a); } -// CHECK-LABEL: test_vreinterpretq_u64_p8 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) { return vreinterpretq_u64_p8(a); } -// CHECK-LABEL: test_vreinterpretq_u64_p16 +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> +// CHECK: ret <2 x i64> [[TMP0]] uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) { return vreinterpretq_u64_p16(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s8 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) { return vreinterpretq_f16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s16 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) { return vreinterpretq_f16_s16(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s32 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) { return vreinterpretq_f16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_f16_s64 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) { return vreinterpretq_f16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_f16_u8 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) { return vreinterpretq_f16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_f16_u16 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) { return vreinterpretq_f16_u16(a); } -// CHECK-LABEL: test_vreinterpretq_f16_u32 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) { return vreinterpretq_f16_u32(a); } 
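+// The vreinterpretq_* conversions in this block are value-preserving type
+// punning: when source and destination lanes already share an IR type the
+// argument is returned unchanged, otherwise exactly one 'bitcast' is emitted.
+// A minimal sketch (illustrative only, not one of the generated tests):
+//   uint32x4_t u = vdupq_n_u32(1);
+//   int32x4_t s = vreinterpretq_s32_u32(u);   // same IR type: ret <4 x i32> %a, no cast
+//   float32x4_t f = vreinterpretq_f32_u32(u); // one bitcast <4 x i32> to <4 x float>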
-// CHECK-LABEL: test_vreinterpretq_f16_u64 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) { return vreinterpretq_f16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_f16_f32 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) { return vreinterpretq_f16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_f16_p8 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) { return vreinterpretq_f16_p8(a); } -// CHECK-LABEL: test_vreinterpretq_f16_p16 +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> +// CHECK: ret <8 x half> [[TMP0]] float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) { return vreinterpretq_f16_p16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s8 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) { return vreinterpretq_f32_s8(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s16 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) { return vreinterpretq_f32_s16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s32 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) { return vreinterpretq_f32_s32(a); } -// CHECK-LABEL: test_vreinterpretq_f32_s64 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) { return vreinterpretq_f32_s64(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u8 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) { return vreinterpretq_f32_u8(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u16 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) { return vreinterpretq_f32_u16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u32 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) { return vreinterpretq_f32_u32(a); } -// CHECK-LABEL: test_vreinterpretq_f32_u64 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64(<2 x 
i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) { return vreinterpretq_f32_u64(a); } -// CHECK-LABEL: test_vreinterpretq_f32_f16 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) { return vreinterpretq_f32_f16(a); } -// CHECK-LABEL: test_vreinterpretq_f32_p8 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) { return vreinterpretq_f32_p8(a); } -// CHECK-LABEL: test_vreinterpretq_f32_p16 +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> +// CHECK: ret <4 x float> [[TMP0]] float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) { return vreinterpretq_f32_p16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_s8 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) { return vreinterpretq_p8_s8(a); } -// CHECK-LABEL: test_vreinterpretq_p8_s16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) { return vreinterpretq_p8_s16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_s32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) { return vreinterpretq_p8_s32(a); } -// CHECK-LABEL: test_vreinterpretq_p8_s64 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) { return vreinterpretq_p8_s64(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u8 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8(<16 x i8> %a) #0 { +// CHECK: ret <16 x i8> %a poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) { return vreinterpretq_p8_u8(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) { return vreinterpretq_p8_u16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) { return vreinterpretq_p8_u32(a); } -// CHECK-LABEL: test_vreinterpretq_p8_u64 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) { return vreinterpretq_p8_u64(a); } -// CHECK-LABEL: test_vreinterpretq_p8_f16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16(<8 x half> %a) 
#0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) { return vreinterpretq_p8_f16(a); } -// CHECK-LABEL: test_vreinterpretq_p8_f32 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) { return vreinterpretq_p8_f32(a); } -// CHECK-LABEL: test_vreinterpretq_p8_p16 +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: ret <16 x i8> [[TMP0]] poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) { return vreinterpretq_p8_p16(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) { return vreinterpretq_p16_s8(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) { return vreinterpretq_p16_s16(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) { return vreinterpretq_p16_s32(a); } -// CHECK-LABEL: test_vreinterpretq_p16_s64 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) { return vreinterpretq_p16_s64(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) { return vreinterpretq_p16_u8(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16(<8 x i16> %a) #0 { +// CHECK: ret <8 x i16> %a poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) { return vreinterpretq_p16_u16(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) { return vreinterpretq_p16_u32(a); } -// CHECK-LABEL: test_vreinterpretq_p16_u64 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) { return vreinterpretq_p16_u64(a); } -// CHECK-LABEL: test_vreinterpretq_p16_f16 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16(<8 x half> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) { return vreinterpretq_p16_f16(a); } -// CHECK-LABEL: test_vreinterpretq_p16_f32 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32(<4 x float> %a) 
#0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) { return vreinterpretq_p16_f32(a); } -// CHECK-LABEL: test_vreinterpretq_p16_p8 +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8(<16 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> +// CHECK: ret <8 x i16> [[TMP0]] poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) { return vreinterpretq_p16_p8(a); } -// CHECK-LABEL: test_vrev16_s8 -// CHECK: vrev16.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vrev16_s8(int8x8_t a) { return vrev16_s8(a); } -// CHECK-LABEL: test_vrev16_u8 -// CHECK: vrev16.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vrev16_u8(uint8x8_t a) { return vrev16_u8(a); } -// CHECK-LABEL: test_vrev16_p8 -// CHECK: vrev16.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vrev16_p8(poly8x8_t a) { return vrev16_p8(a); } -// CHECK-LABEL: test_vrev16q_s8 -// CHECK: vrev16.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vrev16q_s8(int8x16_t a) { return vrev16q_s8(a); } -// CHECK-LABEL: test_vrev16q_u8 -// CHECK: vrev16.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vrev16q_u8(uint8x16_t a) { return vrev16q_u8(a); } -// CHECK-LABEL: test_vrev16q_p8 -// CHECK: vrev16.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vrev16q_p8(poly8x16_t a) { return vrev16q_p8(a); } -// CHECK-LABEL: test_vrev32_s8 -// CHECK: vrev32.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vrev32_s8(int8x8_t a) { return vrev32_s8(a); } -// CHECK-LABEL: test_vrev32_s16 -// CHECK: vrev32.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] int16x4_t test_vrev32_s16(int16x4_t a) { return vrev32_s16(a); } -// CHECK-LABEL: test_vrev32_u8 -// CHECK: vrev32.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vrev32_u8(uint8x8_t a) { return vrev32_u8(a); } -// CHECK-LABEL: test_vrev32_u16 -// CHECK: vrev32.16 d{{[0-9]+}}, d{{[0-9]+}} +//
CHECK-LABEL: define <4 x i16> @test_vrev32_u16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vrev32_u16(uint16x4_t a) { return vrev32_u16(a); } -// CHECK-LABEL: test_vrev32_p8 -// CHECK: vrev32.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vrev32_p8(poly8x8_t a) { return vrev32_p8(a); } -// CHECK-LABEL: test_vrev32_p16 -// CHECK: vrev32.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vrev32_p16(poly16x4_t a) { return vrev32_p16(a); } -// CHECK-LABEL: test_vrev32q_s8 -// CHECK: vrev32.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vrev32q_s8(int8x16_t a) { return vrev32q_s8(a); } -// CHECK-LABEL: test_vrev32q_s16 -// CHECK: vrev32.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vrev32q_s16(int16x8_t a) { return vrev32q_s16(a); } -// CHECK-LABEL: test_vrev32q_u8 -// CHECK: vrev32.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vrev32q_u8(uint8x16_t a) { return vrev32q_u8(a); } -// CHECK-LABEL: test_vrev32q_u16 -// CHECK: vrev32.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vrev32q_u16(uint16x8_t a) { return vrev32q_u16(a); } -// CHECK-LABEL: test_vrev32q_p8 -// CHECK: vrev32.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vrev32q_p8(poly8x16_t a) { return vrev32q_p8(a); } -// CHECK-LABEL: test_vrev32q_p16 -// CHECK: vrev32.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vrev32q_p16(poly16x8_t a) { return vrev32q_p16(a); } -// CHECK-LABEL: test_vrev64_s8 -// CHECK: vrev64.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] int8x8_t test_vrev64_s8(int8x8_t a) { return vrev64_s8(a); } -// CHECK-LABEL: test_vrev64_s16 -// CHECK: vrev64.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vrev64_s16(int16x4_t a) { return vrev64_s16(a); } -// CHECK-LABEL: test_vrev64_s32 -// CHECK: vrev64.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vrev64_s32(int32x2_t a) { return vrev64_s32(a); } -// CHECK-LABEL: test_vrev64_u8 -// CHECK: vrev64.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] uint8x8_t test_vrev64_u8(uint8x8_t a) { return vrev64_u8(a); } -// CHECK-LABEL: test_vrev64_u16 -// CHECK: vrev64.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] uint16x4_t test_vrev64_u16(uint16x4_t a) { return vrev64_u16(a); } -// CHECK-LABEL: test_vrev64_u32 -// CHECK: vrev64.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(<2 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0> +// CHECK: ret <2 x i32> [[SHUFFLE_I]] uint32x2_t test_vrev64_u32(uint32x2_t a) { return vrev64_u32(a); } -// CHECK-LABEL: test_vrev64_p8 -// CHECK: vrev64.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(<8 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <8 x i8> [[SHUFFLE_I]] poly8x8_t test_vrev64_p8(poly8x8_t a) { return vrev64_p8(a); } -// CHECK-LABEL: test_vrev64_p16 -// CHECK: vrev64.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(<4 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK: ret <4 x i16> [[SHUFFLE_I]] poly16x4_t test_vrev64_p16(poly16x4_t a) { return vrev64_p16(a); } -// CHECK-LABEL: test_vrev64_f32 -// CHECK: vrev64.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0> +// CHECK: ret <2 x float> [[SHUFFLE_I]] float32x2_t test_vrev64_f32(float32x2_t a) { return vrev64_f32(a); } -// CHECK-LABEL: test_vrev64q_s8 -// CHECK: vrev64.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] int8x16_t test_vrev64q_s8(int8x16_t a) { return vrev64q_s8(a); } -// CHECK-LABEL: test_vrev64q_s16 -// CHECK: vrev64.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] int16x8_t test_vrev64q_s16(int16x8_t a) { return vrev64q_s16(a); } -// CHECK-LABEL: test_vrev64q_s32 -// CHECK: vrev64.32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] int32x4_t test_vrev64q_s32(int32x4_t a) { return vrev64q_s32(a); } -// CHECK-LABEL: test_vrev64q_u8 -// CHECK: vrev64.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8>
@test_vrev64q_u8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] uint8x16_t test_vrev64q_u8(uint8x16_t a) { return vrev64q_u8(a); } -// CHECK-LABEL: test_vrev64q_u16 -// CHECK: vrev64.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] uint16x8_t test_vrev64q_u16(uint16x8_t a) { return vrev64q_u16(a); } -// CHECK-LABEL: test_vrev64q_u32 -// CHECK: vrev64.32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(<4 x i32> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x i32> [[SHUFFLE_I]] uint32x4_t test_vrev64q_u32(uint32x4_t a) { return vrev64q_u32(a); } -// CHECK-LABEL: test_vrev64q_p8 -// CHECK: vrev64.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(<16 x i8> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8> +// CHECK: ret <16 x i8> [[SHUFFLE_I]] poly8x16_t test_vrev64q_p8(poly8x16_t a) { return vrev64q_p8(a); } -// CHECK-LABEL: test_vrev64q_p16 -// CHECK: vrev64.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(<8 x i16> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK: ret <8 x i16> [[SHUFFLE_I]] poly16x8_t test_vrev64q_p16(poly16x8_t a) { return vrev64q_p16(a); } -// CHECK-LABEL: test_vrev64q_f32 -// CHECK: vrev64.32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 { +// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK: ret <4 x float> [[SHUFFLE_I]] float32x4_t test_vrev64q_f32(float32x4_t a) { return vrev64q_f32(a); } -// CHECK-LABEL: test_vrhadd_s8 -// CHECK: vrhadd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VRHADD_V_I]] int8x8_t test_vrhadd_s8(int8x8_t a, int8x8_t b) { return vrhadd_s8(a, b); } -// CHECK-LABEL: test_vrhadd_s16 -// CHECK: vrhadd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4 +// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) { return vrhadd_s16(a, b); } -// CHECK-LABEL: test_vrhadd_s32 -// CHECK: vrhadd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK:
[[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4 +// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vrhadd_s32(int32x2_t a, int32x2_t b) { return vrhadd_s32(a, b); } -// CHECK-LABEL: test_vrhadd_u8 -// CHECK: vrhadd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VRHADD_V_I]] uint8x8_t test_vrhadd_u8(uint8x8_t a, uint8x8_t b) { return vrhadd_u8(a, b); } -// CHECK-LABEL: test_vrhadd_u16 -// CHECK: vrhadd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4 +// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) { return vrhadd_u16(a, b); } -// CHECK-LABEL: test_vrhadd_u32 -// CHECK: vrhadd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4 +// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vrhadd_u32(uint32x2_t a, uint32x2_t b) { return vrhadd_u32(a, b); } -// CHECK-LABEL: test_vrhaddq_s8 -// CHECK: vrhadd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VRHADDQ_V_I]] int8x16_t test_vrhaddq_s8(int8x16_t a, int8x16_t b) { return vrhaddq_s8(a, b); } -// CHECK-LABEL: test_vrhaddq_s16 -// CHECK: vrhadd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4 +// CHECK: 
[[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) { return vrhaddq_s16(a, b); } -// CHECK-LABEL: test_vrhaddq_s32 -// CHECK: vrhadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4 +// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vrhaddq_s32(int32x4_t a, int32x4_t b) { return vrhaddq_s32(a, b); } -// CHECK-LABEL: test_vrhaddq_u8 -// CHECK: vrhadd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VRHADDQ_V_I]] uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) { return vrhaddq_u8(a, b); } -// CHECK-LABEL: test_vrhaddq_u16 -// CHECK: vrhadd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4 +// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) { return vrhaddq_u16(a, b); } -// CHECK-LABEL: test_vrhaddq_u32 -// CHECK: vrhadd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4 +// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) { return vrhaddq_u32(a, b); } -// CHECK-LABEL: test_vrshl_s8 -// CHECK: vrshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> 
[[VRSHL_V_I]] int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) { return vrshl_s8(a, b); } -// CHECK-LABEL: test_vrshl_s16 -// CHECK: vrshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4 +// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) { return vrshl_s16(a, b); } -// CHECK-LABEL: test_vrshl_s32 -// CHECK: vrshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4 +// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) { return vrshl_s32(a, b); } -// CHECK-LABEL: test_vrshl_s64 -// CHECK: vrshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4 +// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) { return vrshl_s64(a, b); } -// CHECK-LABEL: test_vrshl_u8 -// CHECK: vrshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VRSHL_V_I]] uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) { return vrshl_u8(a, b); } -// CHECK-LABEL: test_vrshl_u16 -// CHECK: vrshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4 +// CHECK: [[VRSHL_V3_I:%.*]] = 
bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) { return vrshl_u16(a, b); } -// CHECK-LABEL: test_vrshl_u32 -// CHECK: vrshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4 +// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) { return vrshl_u32(a, b); } -// CHECK-LABEL: test_vrshl_u64 -// CHECK: vrshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4 +// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) { return vrshl_u64(a, b); } -// CHECK-LABEL: test_vrshlq_s8 -// CHECK: vrshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VRSHLQ_V_I]] int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) { return vrshlq_s8(a, b); } -// CHECK-LABEL: test_vrshlq_s16 -// CHECK: vrshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) { return vrshlq_s16(a, b); } -// CHECK-LABEL: test_vrshlq_s32 -// CHECK: vrshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> 
[[TMP1]] to <4 x i32> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) { return vrshlq_s32(a, b); } -// CHECK-LABEL: test_vrshlq_s64 -// CHECK: vrshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) { return vrshlq_s64(a, b); } -// CHECK-LABEL: test_vrshlq_u8 -// CHECK: vrshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VRSHLQ_V_I]] uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) { return vrshlq_u8(a, b); } -// CHECK-LABEL: test_vrshlq_u16 -// CHECK: vrshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) { return vrshlq_u16(a, b); } -// CHECK-LABEL: test_vrshlq_u32 -// CHECK: vrshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) { return vrshlq_u32(a, b); } -// CHECK-LABEL: test_vrshlq_u64 -// CHECK: vrshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4 +// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) { return vrshlq_u64(a, b); } -// CHECK-LABEL: test_vrshrn_n_s16 -// CHECK: vrshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 1) +// CHECK: ret <8 x i8> [[VRSHRN_N]]1 int8x8_t test_vrshrn_n_s16(int16x8_t a) { return vrshrn_n_s16(a, 1); } -// CHECK-LABEL: test_vrshrn_n_s32 -// CHECK: vrshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 1) +// CHECK: ret <4 x i16> [[VRSHRN_N]]1 int16x4_t test_vrshrn_n_s32(int32x4_t a) { return vrshrn_n_s32(a, 1); } -// CHECK-LABEL: test_vrshrn_n_s64 -// CHECK: vrshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 1) +// CHECK: ret <2 x i32> [[VRSHRN_N]]1 int32x2_t test_vrshrn_n_s64(int64x2_t a) { return vrshrn_n_s64(a, 1); } -// CHECK-LABEL: test_vrshrn_n_u16 -// CHECK: vrshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 1) +// CHECK: ret <8 x i8> [[VRSHRN_N]]1 uint8x8_t test_vrshrn_n_u16(uint16x8_t a) { return vrshrn_n_u16(a, 1); } -// CHECK-LABEL: test_vrshrn_n_u32 -// CHECK: vrshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 1) +// CHECK: ret <4 x i16> [[VRSHRN_N]]1 uint16x4_t test_vrshrn_n_u32(uint32x4_t a) { return vrshrn_n_u32(a, 1); } -// CHECK-LABEL: test_vrshrn_n_u64 -// CHECK: vrshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: 
[[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 1) +// CHECK: ret <2 x i32> [[VRSHRN_N]]1 uint32x2_t test_vrshrn_n_u64(uint64x2_t a) { return vrshrn_n_u64(a, 1); } -// CHECK-LABEL: test_vrshr_n_s8 -// CHECK: vrshr.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: ret <8 x i8> [[VRSHR_N]] int8x8_t test_vrshr_n_s8(int8x8_t a) { return vrshr_n_s8(a, 1); } -// CHECK-LABEL: test_vrshr_n_s16 -// CHECK: vrshr.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: ret <4 x i16> [[VRSHR_N]]1 int16x4_t test_vrshr_n_s16(int16x4_t a) { return vrshr_n_s16(a, 1); } -// CHECK-LABEL: test_vrshr_n_s32 -// CHECK: vrshr.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>) +// CHECK: ret <2 x i32> [[VRSHR_N]]1 int32x2_t test_vrshr_n_s32(int32x2_t a) { return vrshr_n_s32(a, 1); } -// CHECK-LABEL: test_vrshr_n_s64 -// CHECK: vrshr.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>) +// CHECK: ret <1 x i64> [[VRSHR_N]]1 int64x1_t test_vrshr_n_s64(int64x1_t a) { return vrshr_n_s64(a, 1); } -// CHECK-LABEL: test_vrshr_n_u8 -// CHECK: vrshr.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: ret <8 x i8> [[VRSHR_N]] uint8x8_t test_vrshr_n_u8(uint8x8_t a) { return vrshr_n_u8(a, 1); } -// CHECK-LABEL: test_vrshr_n_u16 -// CHECK: vrshr.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: ret <4 x i16> [[VRSHR_N]]1 uint16x4_t test_vrshr_n_u16(uint16x4_t a) { return vrshr_n_u16(a, 1); } -// CHECK-LABEL: test_vrshr_n_u32 -// CHECK: vrshr.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>) +// CHECK: ret <2 x i32> [[VRSHR_N]]1 uint32x2_t test_vrshr_n_u32(uint32x2_t a) { return vrshr_n_u32(a, 1); } -// CHECK-LABEL: test_vrshr_n_u64 -// CHECK: vrshr.u64 d{{[0-9]+}},
d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>) +// CHECK: ret <1 x i64> [[VRSHR_N]]1 uint64x1_t test_vrshr_n_u64(uint64x1_t a) { return vrshr_n_u64(a, 1); } -// CHECK-LABEL: test_vrshrq_n_s8 -// CHECK: vrshr.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: ret <16 x i8> [[VRSHR_N]] int8x16_t test_vrshrq_n_s8(int8x16_t a) { return vrshrq_n_s8(a, 1); } -// CHECK-LABEL: test_vrshrq_n_s16 -// CHECK: vrshr.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: ret <8 x i16> [[VRSHR_N]]1 int16x8_t test_vrshrq_n_s16(int16x8_t a) { return vrshrq_n_s16(a, 1); } -// CHECK-LABEL: test_vrshrq_n_s32 -// CHECK: vrshr.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>) +// CHECK: ret <4 x i32> [[VRSHR_N]]1 int32x4_t test_vrshrq_n_s32(int32x4_t a) { return vrshrq_n_s32(a, 1); } -// CHECK-LABEL: test_vrshrq_n_s64 -// CHECK: vrshr.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>) +// CHECK: ret <2 x i64> [[VRSHR_N]]1 int64x2_t test_vrshrq_n_s64(int64x2_t a) { return vrshrq_n_s64(a, 1); } -// CHECK-LABEL: test_vrshrq_n_u8 -// CHECK: vrshr.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: ret <16 x i8> [[VRSHR_N]] uint8x16_t test_vrshrq_n_u8(uint8x16_t a) { return vrshrq_n_u8(a, 1); } -// CHECK-LABEL: test_vrshrq_n_u16 -// CHECK: vrshr.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: ret <8 x i16> [[VRSHR_N]]1 uint16x8_t test_vrshrq_n_u16(uint16x8_t a) { return vrshrq_n_u16(a, 1); } -// CHECK-LABEL: test_vrshrq_n_u32 -// CHECK: vrshr.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +//
CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>) +// CHECK: ret <4 x i32> [[VRSHR_N]]1 uint32x4_t test_vrshrq_n_u32(uint32x4_t a) { return vrshrq_n_u32(a, 1); } -// CHECK-LABEL: test_vrshrq_n_u64 -// CHECK: vrshr.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>) +// CHECK: ret <2 x i64> [[VRSHR_N]]1 uint64x2_t test_vrshrq_n_u64(uint64x2_t a) { return vrshrq_n_u64(a, 1); } -// CHECK-LABEL: test_vrsqrte_f32 -// CHECK: vrsqrte.f32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> [[VRSQRTE_V_I]]) #4 +// CHECK: ret <2 x float> [[VRSQRTE_V1_I]] float32x2_t test_vrsqrte_f32(float32x2_t a) { return vrsqrte_f32(a); } -// CHECK-LABEL: test_vrsqrte_u32 -// CHECK: vrsqrte.u32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> [[VRSQRTE_V_I]]) #4 +// CHECK: ret <2 x i32> [[VRSQRTE_V1_I]] uint32x2_t test_vrsqrte_u32(uint32x2_t a) { return vrsqrte_u32(a); } -// CHECK-LABEL: test_vrsqrteq_f32 -// CHECK: vrsqrte.f32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> [[VRSQRTEQ_V_I]]) #4 +// CHECK: ret <4 x float> [[VRSQRTEQ_V1_I]] float32x4_t test_vrsqrteq_f32(float32x4_t a) { return vrsqrteq_f32(a); } -// CHECK-LABEL: test_vrsqrteq_u32 -// CHECK: vrsqrte.u32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> [[VRSQRTEQ_V_I]]) #4 +// CHECK: ret <4 x i32> [[VRSQRTEQ_V1_I]] uint32x4_t test_vrsqrteq_u32(uint32x4_t a) { return vrsqrteq_u32(a); } -// CHECK-LABEL: test_vrsqrts_f32 -// CHECK: vrsqrts.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x float> @test_vrsqrts_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> [[VRSQRTS_V_I]], <2 x float> [[VRSQRTS_V1_I]]) #4 +// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8>
[[VRSQRTS_V3_I]] to <2 x float> +// CHECK: ret <2 x float> [[TMP2]] float32x2_t test_vrsqrts_f32(float32x2_t a, float32x2_t b) { return vrsqrts_f32(a, b); } -// CHECK-LABEL: test_vrsqrtsq_f32 -// CHECK: vrsqrts.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> [[VRSQRTSQ_V_I]], <4 x float> [[VRSQRTSQ_V1_I]]) #4 +// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <4 x float> +// CHECK: ret <4 x float> [[TMP2]] float32x4_t test_vrsqrtsq_f32(float32x4_t a, float32x4_t b) { return vrsqrtsq_f32(a, b); } -// CHECK-LABEL: test_vrsra_n_s8 -// CHECK: vrsra.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]] +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) { return vrsra_n_s8(a, b, 1); } -// CHECK-LABEL: test_vrsra_n_s16 -// CHECK: vrsra.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i16> [[TMP3]] int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) { return vrsra_n_s16(a, b, 1); } -// CHECK-LABEL: test_vrsra_n_s32 -// CHECK: vrsra.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i32> [[TMP3]] int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) { return vrsra_n_s32(a, b, 1); } -// CHECK-LABEL: test_vrsra_n_s64 -// CHECK: vrsra.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N]]1 +//
CHECK: ret <1 x i64> [[TMP3]] int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) { return vrsra_n_s64(a, b, 1); } -// CHECK-LABEL: test_vrsra_n_u8 -// CHECK: vrsra.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]] +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) { return vrsra_n_u8(a, b, 1); } -// CHECK-LABEL: test_vrsra_n_u16 -// CHECK: vrsra.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i16> [[TMP3]] uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) { return vrsra_n_u16(a, b, 1); } -// CHECK-LABEL: test_vrsra_n_u32 -// CHECK: vrsra.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i32> [[TMP3]] uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) { return vrsra_n_u32(a, b, 1); } -// CHECK-LABEL: test_vrsra_n_u64 -// CHECK: vrsra.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vrsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <1 x i64> [[TMP3]] uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) { return vrsra_n_u64(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_s8 -// CHECK: vrsra.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]] +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) { return vrsraq_n_s8(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_s16 -// CHECK: vrsra.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x
i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <8 x i16> [[TMP3]] int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) { return vrsraq_n_s16(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_s32 -// CHECK: vrsra.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i32> [[TMP3]] int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) { return vrsraq_n_s32(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_s64 -// CHECK: vrsra.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i64> [[TMP3]] int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) { return vrsraq_n_s64(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_u8 -// CHECK: vrsra.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]] +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) { return vrsraq_n_u8(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_u16 -// CHECK: vrsra.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <8 x i16> [[TMP3]] uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) { return vrsraq_n_u16(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_u32 -// CHECK: vrsra.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]]
to <4 x i32> +// CHECK: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <4 x i32> [[TMP3]] uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) { return vrsraq_n_u32(a, b, 1); } -// CHECK-LABEL: test_vrsraq_n_u64 -// CHECK: vrsra.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>) +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N]]1 +// CHECK: ret <2 x i64> [[TMP3]] uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) { return vrsraq_n_u64(a, b, 1); } -// CHECK-LABEL: test_vrsubhn_s16 -// CHECK: vrsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]] int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) { return vrsubhn_s16(a, b); } -// CHECK-LABEL: test_vrsubhn_s32 -// CHECK: vrsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) { return vrsubhn_s32(a, b); } -// CHECK-LABEL: test_vrsubhn_s64 -// CHECK: vrsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) { return vrsubhn_s64(a, b); } -// CHECK-LABEL: test_vrsubhn_u16 -// CHECK: vrsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK:
[[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4 +// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]] uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) { return vrsubhn_u16(a, b); } -// CHECK-LABEL: test_vrsubhn_u32 -// CHECK: vrsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) { return vrsubhn_u32(a, b); } -// CHECK-LABEL: test_vrsubhn_u64 -// CHECK: vrsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4 +// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) { return vrsubhn_u64(a, b); } -// CHECK-LABEL: test_vset_lane_u8 -// CHECK: vmov +// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(i8 %a, <8 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VSET_LANE]] uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) { return vset_lane_u8(a, b, 7); } -// CHECK-LABEL: test_vset_lane_u16 -// CHECK: vmov +// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VSET_LANE]] uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) { return vset_lane_u16(a, b, 3); } -// CHECK-LABEL: test_vset_lane_u32 -// CHECK: mov +// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(i32 %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1 +// CHECK: ret <2 x i32> [[VSET_LANE]] uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) { return vset_lane_u32(a, b, 1); } -// CHECK-LABEL: test_vset_lane_s8 -// CHECK: vmov +// 
CHECK-LABEL: define <8 x i8> @test_vset_lane_s8(i8 %a, <8 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VSET_LANE]] int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) { return vset_lane_s8(a, b, 7); } -// CHECK-LABEL: test_vset_lane_s16 -// CHECK: vmov +// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VSET_LANE]] int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) { return vset_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vset_lane_s32 -// CHECK: mov +// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32(i32 %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1 +// CHECK: ret <2 x i32> [[VSET_LANE]] int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) { return vset_lane_s32(a, b, 1); } -// CHECK-LABEL: test_vset_lane_p8 -// CHECK: vmov +// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8(i8 %a, <8 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 +// CHECK: ret <8 x i8> [[VSET_LANE]] poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) { return vset_lane_p8(a, b, 7); } -// CHECK-LABEL: test_vset_lane_p16 -// CHECK: vmov +// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16(i16 %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3 +// CHECK: ret <4 x i16> [[VSET_LANE]] poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) { return vset_lane_p16(a, b, 3); } -// CHECK-LABEL: test_vset_lane_f32 -// CHECK: mov +// CHECK-LABEL: define <2 x float> @test_vset_lane_f32(float %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float %a, i32 1 +// CHECK: ret <2 x float> [[VSET_LANE]] float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) { return vset_lane_f32(a, b, 1); } -// CHECK-LABEL: test_vset_lane_f16 -// CHECK: mov +// CHECK-LABEL: define <4 x half> @test_vset_lane_f16(half* %a, <4 x half> %b) #0 { +// CHECK: [[__REINT_246:%.*]] = alloca half, align 2 +// CHECK: [[__REINT1_246:%.*]] = alloca <4 x half>, align 8 +// CHECK: [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8 +// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 +// CHECK: store half [[TMP0]], half* [[__REINT_246]], align 2 +// CHECK: store <4 x half> %b, <4 x half>* [[__REINT1_246]], align 8 +// CHECK: [[TMP1:%.*]] = bitcast half* [[__REINT_246]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK: [[TMP3:%.*]] = bitcast <4 x half>* [[__REINT1_246]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 8 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[TMP2]], i32 1 +// CHECK: store <4 x i16> [[VSET_LANE]], <4 x i16>* [[__REINT2_246]], align 8 +// CHECK: [[TMP7:%.*]] = 
bitcast <4 x i16>* [[__REINT2_246]] to <4 x half>* +// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[TMP7]], align 8 +// CHECK: ret <4 x half> [[TMP8]] float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) { return vset_lane_f16(*a, b, 1); } -// CHECK-LABEL: test_vsetq_lane_u8 -// CHECK: vmov +// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8(i8 %a, <16 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VSET_LANE]] uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) { return vsetq_lane_u8(a, b, 15); } -// CHECK-LABEL: test_vsetq_lane_u16 -// CHECK: vmov +// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VSET_LANE]] uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) { return vsetq_lane_u16(a, b, 7); } -// CHECK-LABEL: test_vsetq_lane_u32 -// CHECK: vmov +// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32(i32 %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3 +// CHECK: ret <4 x i32> [[VSET_LANE]] uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) { return vsetq_lane_u32(a, b, 3); } -// CHECK-LABEL: test_vsetq_lane_s8 -// CHECK: vmov +// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8(i8 %a, <16 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VSET_LANE]] int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) { return vsetq_lane_s8(a, b, 15); } -// CHECK-LABEL: test_vsetq_lane_s16 -// CHECK: vmov +// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VSET_LANE]] int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) { return vsetq_lane_s16(a, b, 7); } -// CHECK-LABEL: test_vsetq_lane_s32 -// CHECK: vmov +// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32(i32 %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3 +// CHECK: ret <4 x i32> [[VSET_LANE]] int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) { return vsetq_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vsetq_lane_p8 -// CHECK: vmov +// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8(i8 %a, <16 x i8> %b) #0 { +// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 +// CHECK: ret <16 x i8> [[VSET_LANE]] poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) { return vsetq_lane_p8(a, b, 15); } -// CHECK-LABEL: test_vsetq_lane_p16 -// CHECK: vmov +// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16(i16 %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7 +// CHECK: ret <8 x i16> [[VSET_LANE]] poly16x8_t 
test_vsetq_lane_p16(poly16_t a, poly16x8_t b) { return vsetq_lane_p16(a, b, 7); } -// CHECK-LABEL: test_vsetq_lane_f32 -// CHECK: vmov +// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32(float %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 3 +// CHECK: ret <4 x float> [[VSET_LANE]] float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) { return vsetq_lane_f32(a, b, 3); } -// CHECK-LABEL: test_vsetq_lane_f16 -// CHECK: vmov +// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16(half* %a, <8 x half> %b) #0 { +// CHECK: [[__REINT_248:%.*]] = alloca half, align 2 +// CHECK: [[__REINT1_248:%.*]] = alloca <8 x half>, align 16 +// CHECK: [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16 +// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2 +// CHECK: store half [[TMP0]], half* [[__REINT_248]], align 2 +// CHECK: store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16 +// CHECK: [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK: [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 3 +// CHECK: store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16 +// CHECK: [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>* +// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16 +// CHECK: ret <8 x half> [[TMP8]] float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) { return vsetq_lane_f16(*a, b, 3); } -// CHECK-LABEL: test_vset_lane_s64 // The optimizer is able to get rid of all moves now. +// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64(i64 %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0 +// CHECK: ret <1 x i64> [[VSET_LANE]] int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) { return vset_lane_s64(a, b, 0); } -// CHECK-LABEL: test_vset_lane_u64 // The optimizer is able to get rid of all moves now. 
+// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64(i64 %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0 +// CHECK: ret <1 x i64> [[VSET_LANE]] uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) { return vset_lane_u64(a, b, 0); } -// CHECK-LABEL: test_vsetq_lane_s64 -// CHECK: vmov +// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64(i64 %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VSET_LANE]] int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) { return vsetq_lane_s64(a, b, 1); } -// CHECK-LABEL: test_vsetq_lane_u64 -// CHECK: vmov +// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64(i64 %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1 +// CHECK: ret <2 x i64> [[VSET_LANE]] uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) { return vsetq_lane_u64(a, b, 1); } -// CHECK-LABEL: test_vshl_s8 -// CHECK: vshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshl_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VSHL_V_I]] int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) { return vshl_s8(a, b); } -// CHECK-LABEL: test_vshl_s16 -// CHECK: vshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshl_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) { return vshl_s16(a, b); } -// CHECK-LABEL: test_vshl_s32 -// CHECK: vshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshl_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) { return vshl_s32(a, b); } -// CHECK-LABEL: test_vshl_s64 -// CHECK: vshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vshl_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = 
bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) { return vshl_s64(a, b); } -// CHECK-LABEL: test_vshl_u8 -// CHECK: vshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshl_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> %a, <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VSHL_V_I]] uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) { return vshl_u8(a, b); } -// CHECK-LABEL: test_vshl_u16 -// CHECK: vshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshl_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16> +// CHECK: ret <4 x i16> [[TMP2]] uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) { return vshl_u16(a, b); } -// CHECK-LABEL: test_vshl_u32 -// CHECK: vshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshl_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32> +// CHECK: ret <2 x i32> [[TMP2]] uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) { return vshl_u32(a, b); } -// CHECK-LABEL: test_vshl_u64 -// CHECK: vshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vshl_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4 +// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64> +// CHECK: ret <1 x i64> [[TMP2]] uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) { return vshl_u64(a, b); } -// CHECK-LABEL: test_vshlq_s8 -// CHECK: vshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x 
i8> @test_vshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VSHLQ_V_I]] int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) { return vshlq_s8(a, b); } -// CHECK-LABEL: test_vshlq_s16 -// CHECK: vshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) { return vshlq_s16(a, b); } -// CHECK-LABEL: test_vshlq_s32 -// CHECK: vshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) { return vshlq_s32(a, b); } -// CHECK-LABEL: test_vshlq_s64 -// CHECK: vshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) { return vshlq_s64(a, b); } -// CHECK-LABEL: test_vshlq_u8 -// CHECK: vshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> %a, <16 x i8> %b) #4 +// CHECK: ret <16 x i8> [[VSHLQ_V_I]] uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) { return vshlq_u8(a, b); } -// CHECK-LABEL: test_vshlq_u16 -// CHECK: vshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHLQ_V1_I:%.*]] 
= bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16> +// CHECK: ret <8 x i16> [[TMP2]] uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) { return vshlq_u16(a, b); } -// CHECK-LABEL: test_vshlq_u32 -// CHECK: vshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32> +// CHECK: ret <4 x i32> [[TMP2]] uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) { return vshlq_u32(a, b); } -// CHECK-LABEL: test_vshlq_u64 -// CHECK: vshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4 +// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64> +// CHECK: ret <2 x i64> [[TMP2]] uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) { return vshlq_u64(a, b); } -// CHECK-LABEL: test_vshll_n_s8 -// CHECK: vshll.s8 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <8 x i16> [[VSHLL_N]] int16x8_t test_vshll_n_s8(int8x8_t a) { return vshll_n_s8(a, 1); } -// CHECK-LABEL: test_vshll_n_s16 -// CHECK: vshll.s16 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: ret <4 x i32> [[VSHLL_N]] int32x4_t test_vshll_n_s16(int16x4_t a) { return vshll_n_s16(a, 1); } -// CHECK-LABEL: test_vshll_n_s32 -// CHECK: vshll.s32 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1> +// CHECK: ret <2 x i64> [[VSHLL_N]] int64x2_t test_vshll_n_s32(int32x2_t a) { return vshll_n_s32(a, 1); } -// CHECK-LABEL: test_vshll_n_u8 -// CHECK: vshll.u8
q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 { +// CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16> +// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <8 x i16> [[VSHLL_N]] uint16x8_t test_vshll_n_u8(uint8x8_t a) { return vshll_n_u8(a, 1); } -// CHECK-LABEL: test_vshll_n_u16 -// CHECK: vshll.u16 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: ret <4 x i32> [[VSHLL_N]] uint32x4_t test_vshll_n_u16(uint16x4_t a) { return vshll_n_u16(a, 1); } -// CHECK-LABEL: test_vshll_n_u32 -// CHECK: vshll.u32 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1> +// CHECK: ret <2 x i64> [[VSHLL_N]] uint64x2_t test_vshll_n_u32(uint32x2_t a) { return vshll_n_u32(a, 1); } -// CHECK-LABEL: test_vshl_n_s8 -// CHECK: vshl.i8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshl_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <8 x i8> [[VSHL_N]] int8x8_t test_vshl_n_s8(int8x8_t a) { return vshl_n_s8(a, 1); } -// CHECK-LABEL: test_vshl_n_s16 -// CHECK: vshl.i16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <4 x i16> [[VSHL_N]] int16x4_t test_vshl_n_s16(int16x4_t a) { return vshl_n_s16(a, 1); } -// CHECK-LABEL: test_vshl_n_s32 -// CHECK: vshl.i32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1> +// CHECK: ret <2 x i32> [[VSHL_N]] int32x2_t test_vshl_n_s32(int32x2_t a) { return vshl_n_s32(a, 1); } -// CHECK-LABEL: test_vshl_n_s64 -// CHECK: vshl.i64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1> +// CHECK: ret <1 x i64> [[VSHL_N]] int64x1_t test_vshl_n_s64(int64x1_t a) { return vshl_n_s64(a, 1); } -// CHECK-LABEL: test_vshl_n_u8 -// CHECK: vshl.i8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshl_n_u8(<8 x i8> %a) #0 { +// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <8 x i8> [[VSHL_N]] uint8x8_t test_vshl_n_u8(uint8x8_t a) { return vshl_n_u8(a, 1); } -// CHECK-LABEL: test_vshl_n_u16 -// CHECK: vshl.i16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x
i16> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <4 x i16> [[VSHL_N]] uint16x4_t test_vshl_n_u16(uint16x4_t a) { return vshl_n_u16(a, 1); } -// CHECK-LABEL: test_vshl_n_u32 -// CHECK: vshl.i32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1> +// CHECK: ret <2 x i32> [[VSHL_N]] uint32x2_t test_vshl_n_u32(uint32x2_t a) { return vshl_n_u32(a, 1); } -// CHECK-LABEL: test_vshl_n_u64 -// CHECK: vshl.i64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1> +// CHECK: ret <1 x i64> [[VSHL_N]] uint64x1_t test_vshl_n_u64(uint64x1_t a) { return vshl_n_u64(a, 1); } -// CHECK-LABEL: test_vshlq_n_s8 -// CHECK: vshl.i8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vshlq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <16 x i8> [[VSHL_N]] int8x16_t test_vshlq_n_s8(int8x16_t a) { return vshlq_n_s8(a, 1); } -// CHECK-LABEL: test_vshlq_n_s16 -// CHECK: vshl.i16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <8 x i16> [[VSHL_N]] int16x8_t test_vshlq_n_s16(int16x8_t a) { return vshlq_n_s16(a, 1); } -// CHECK-LABEL: test_vshlq_n_s32 -// CHECK: vshl.i32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: ret <4 x i32> [[VSHL_N]] int32x4_t test_vshlq_n_s32(int32x4_t a) { return vshlq_n_s32(a, 1); } -// CHECK-LABEL: test_vshlq_n_s64 -// CHECK: vshl.i64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1> +// CHECK: ret <2 x i64> [[VSHL_N]] int64x2_t test_vshlq_n_s64(int64x2_t a) { return vshlq_n_s64(a, 1); } -// CHECK-LABEL: test_vshlq_n_u8 -// CHECK: vshl.i8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vshlq_n_u8(<16 x i8> %a) #0 { +// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <16 x i8> [[VSHL_N]] uint8x16_t test_vshlq_n_u8(uint8x16_t a) { return vshlq_n_u8(a, 1); } -// CHECK-LABEL: test_vshlq_n_u16 -// CHECK: vshl.i16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <8 x i16> [[VSHL_N]] uint16x8_t test_vshlq_n_u16(uint16x8_t a) { return vshlq_n_u16(a, 1); } -// CHECK-LABEL: test_vshlq_n_u32 -// CHECK: vshl.i32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: ret <4 x i32> [[VSHL_N]] uint32x4_t test_vshlq_n_u32(uint32x4_t a) { return vshlq_n_u32(a, 1); } -// CHECK-LABEL: test_vshlq_n_u64 -// CHECK: vshl.i64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1> +// CHECK: ret <2 x i64> [[VSHL_N]] uint64x2_t test_vshlq_n_u64(uint64x2_t a) { return vshlq_n_u64(a, 1); } -// CHECK-LABEL: test_vshrn_n_s16 -// CHECK: vshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> +// CHECK: ret <8 x i8> [[VSHRN_N]] int8x8_t test_vshrn_n_s16(int16x8_t a) { return vshrn_n_s16(a, 1); } -// CHECK-LABEL: test_vshrn_n_s32 -// CHECK: vshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VSHRN_N]] int16x4_t test_vshrn_n_s32(int32x4_t a) { return vshrn_n_s32(a, 1); } -// CHECK-LABEL: test_vshrn_n_s64 -// CHECK: vshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1> +// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VSHRN_N]] int32x2_t test_vshrn_n_s64(int64x2_t a) { return vshrn_n_s64(a, 1); } -// CHECK-LABEL: test_vshrn_n_u16 -// CHECK: vshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> +// CHECK: ret <8 x i8> [[VSHRN_N]] uint8x8_t test_vshrn_n_u16(uint16x8_t a) { return vshrn_n_u16(a, 1); } -// CHECK-LABEL: test_vshrn_n_u32 -// CHECK: vshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> +// CHECK: ret <4 x i16> [[VSHRN_N]] uint16x4_t test_vshrn_n_u32(uint32x4_t a) { return vshrn_n_u32(a, 1); } -// CHECK-LABEL: test_vshrn_n_u64 -// CHECK: vshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) #0 { +// CHECK:
[[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1> +// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +// CHECK: ret <2 x i32> [[VSHRN_N]] uint32x2_t test_vshrn_n_u64(uint64x2_t a) { return vshrn_n_u64(a, 1); } -// CHECK-LABEL: test_vshr_n_s8 -// CHECK: vshr.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <8 x i8> [[VSHR_N]] int8x8_t test_vshr_n_s8(int8x8_t a) { return vshr_n_s8(a, 1); } -// CHECK-LABEL: test_vshr_n_s16 -// CHECK: vshr.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <4 x i16> [[VSHR_N]] int16x4_t test_vshr_n_s16(int16x4_t a) { return vshr_n_s16(a, 1); } -// CHECK-LABEL: test_vshr_n_s32 -// CHECK: vshr.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 1, i32 1> +// CHECK: ret <2 x i32> [[VSHR_N]] int32x2_t test_vshr_n_s32(int32x2_t a) { return vshr_n_s32(a, 1); } -// CHECK-LABEL: test_vshr_n_s64 -// CHECK: vshr.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1> +// CHECK: ret <1 x i64> [[VSHR_N]] int64x1_t test_vshr_n_s64(int64x1_t a) { return vshr_n_s64(a, 1); } -// CHECK-LABEL: test_vshr_n_u8 -// CHECK: vshr.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <8 x i8> [[VSHR_N]] uint8x8_t test_vshr_n_u8(uint8x8_t a) { return vshr_n_u8(a, 1); } -// CHECK-LABEL: test_vshr_n_u16 -// CHECK: vshr.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <4 x i16> [[VSHR_N]] uint16x4_t test_vshr_n_u16(uint16x4_t a) { return vshr_n_u16(a, 1); } -// CHECK-LABEL: test_vshr_n_u32 -// CHECK: vshr.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 1, i32 1> +// CHECK: ret <2 x i32> [[VSHR_N]] uint32x2_t test_vshr_n_u32(uint32x2_t a) { return vshr_n_u32(a, 1); } -// CHECK-LABEL: test_vshr_n_u64 -// CHECK: vshr.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(<1 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1> +// CHECK: ret <1 x i64> [[VSHR_N]] uint64x1_t
test_vshr_n_u64(uint64x1_t a) { return vshr_n_u64(a, 1); } -// CHECK-LABEL: test_vshrq_n_s8 -// CHECK: vshr.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <16 x i8> [[VSHR_N]] int8x16_t test_vshrq_n_s8(int8x16_t a) { return vshrq_n_s8(a, 1); } -// CHECK-LABEL: test_vshrq_n_s16 -// CHECK: vshr.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <8 x i16> [[VSHR_N]] int16x8_t test_vshrq_n_s16(int16x8_t a) { return vshrq_n_s16(a, 1); } -// CHECK-LABEL: test_vshrq_n_s32 -// CHECK: vshr.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: ret <4 x i32> [[VSHR_N]] int32x4_t test_vshrq_n_s32(int32x4_t a) { return vshrq_n_s32(a, 1); } -// CHECK-LABEL: test_vshrq_n_s64 -// CHECK: vshr.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1> +// CHECK: ret <2 x i64> [[VSHR_N]] int64x2_t test_vshrq_n_s64(int64x2_t a) { return vshrq_n_s64(a, 1); } -// CHECK-LABEL: test_vshrq_n_u8 -// CHECK: vshr.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) #0 { +// CHECK: [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: ret <16 x i8> [[VSHR_N]] uint8x16_t test_vshrq_n_u8(uint8x16_t a) { return vshrq_n_u8(a, 1); } -// CHECK-LABEL: test_vshrq_n_u16 -// CHECK: vshr.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: ret <8 x i16> [[VSHR_N]] uint16x8_t test_vshrq_n_u16(uint16x8_t a) { return vshrq_n_u16(a, 1); } -// CHECK-LABEL: test_vshrq_n_u32 -// CHECK: vshr.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: ret <4 x i32> [[VSHR_N]] uint32x4_t test_vshrq_n_u32(uint32x4_t a) { return vshrq_n_u32(a, 1); } -// CHECK-LABEL: test_vshrq_n_u64 -// CHECK: vshr.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1> +// CHECK: ret <2 x i64> [[VSHR_N]] uint64x2_t test_vshrq_n_u64(uint64x2_t a) { return vshrq_n_u64(a, 1); } -// CHECK-LABEL: test_vsli_n_s8 -// CHECK: vsli.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +//
CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 1) +// CHECK: ret <8 x i8> [[VSLI_N]] int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) { return vsli_n_s8(a, b, 1); } -// CHECK-LABEL: test_vsli_n_s16 -// CHECK: vsli.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 1) +// CHECK: ret <4 x i16> [[VSLI_N2]] int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) { return vsli_n_s16(a, b, 1); } -// CHECK-LABEL: test_vsli_n_s32 -// CHECK: vsli.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 1) +// CHECK: ret <2 x i32> [[VSLI_N2]] int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) { return vsli_n_s32(a, b, 1); } -// CHECK-LABEL: test_vsli_n_s64 -// CHECK: vsli.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1) +// CHECK: ret <1 x i64> [[VSLI_N2]] int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) { return vsli_n_s64(a, b, 1); } -// CHECK-LABEL: test_vsli_n_u8 -// CHECK: vsli.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vsli_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 1) +// CHECK: ret <8 x i8> [[VSLI_N]] uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) { return vsli_n_u8(a, b, 1); } -// CHECK-LABEL: test_vsli_n_u16 -// CHECK: vsli.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsli_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 1) +// CHECK: ret <4 x i16> [[VSLI_N2]] uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) { return vsli_n_u16(a, b, 1); } -// CHECK-LABEL: test_vsli_n_u32 -// CHECK: vsli.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK:
[[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 1) +// CHECK: ret <2 x i32> [[VSLI_N2]] uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) { return vsli_n_u32(a, b, 1); } -// CHECK-LABEL: test_vsli_n_u64 -// CHECK: vsli.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1) +// CHECK: ret <1 x i64> [[VSLI_N2]] uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) { return vsli_n_u64(a, b, 1); } -// CHECK-LABEL: test_vsli_n_p8 -// CHECK: vsli.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 1) +// CHECK: ret <8 x i8> [[VSLI_N]] poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) { return vsli_n_p8(a, b, 1); } -// CHECK-LABEL: test_vsli_n_p16 -// CHECK: vsli.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 1) +// CHECK: ret <4 x i16> [[VSLI_N2]] poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) { return vsli_n_p16(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_s8 -// CHECK: vsli.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) +// CHECK: ret <16 x i8> [[VSLI_N]] int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) { return vsliq_n_s8(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_s16 -// CHECK: vsli.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 1) +// CHECK: ret <8 x i16> [[VSLI_N2]] int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) { return vsliq_n_s16(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_s32 -// CHECK: vsli.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 1) +// CHECK: ret <4 x i32> [[VSLI_N2]] int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) { return vsliq_n_s32(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_s64 -// CHECK: vsli.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 1) +// CHECK: ret <2 x i64> [[VSLI_N2]] int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) { return vsliq_n_s64(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_u8 -// CHECK: vsli.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsliq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) +// CHECK: ret <16 x i8> [[VSLI_N]] uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) { return vsliq_n_u8(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_u16 -// CHECK: vsli.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 1) +// CHECK: ret <8 x i16> [[VSLI_N2]] uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) { return vsliq_n_u16(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_u32 -// CHECK: vsli.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 1) +// CHECK: ret <4 x i32> [[VSLI_N2]] uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) { return vsliq_n_u32(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_u64 -// CHECK: vsli.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 1) +// CHECK: ret <2 x i64> [[VSLI_N2]] uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) { return vsliq_n_u64(a, b, 1); } -// CHECK-LABEL:
test_vsliq_n_p8 -// CHECK: vsli.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) +// CHECK: ret <16 x i8> [[VSLI_N]] poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) { return vsliq_n_p8(a, b, 1); } -// CHECK-LABEL: test_vsliq_n_p16 -// CHECK: vsli.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 1) +// CHECK: ret <8 x i16> [[VSLI_N2]] poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) { return vsliq_n_p16(a, b, 1); } -// CHECK-LABEL: test_vsra_n_s8 -// CHECK: vsra.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]] +// CHECK: ret <8 x i8> [[TMP0]] int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) { return vsra_n_s8(a, b, 1); } -// CHECK-LABEL: test_vsra_n_s16 -// CHECK: vsra.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1> +// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i16> [[TMP4]] int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) { return vsra_n_s16(a, b, 1); } -// CHECK-LABEL: test_vsra_n_s32 -// CHECK: vsra.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], <i32 1, i32 1> +// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i32> [[TMP4]] int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) { return vsra_n_s32(a, b, 1); } -// CHECK-LABEL: test_vsra_n_s64 -// CHECK: vsra.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], <i64 1> +// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]] +// CHECK: ret <1 x i64> [[TMP4]] int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) { return vsra_n_s64(a, b, 1); } -// CHECK-LABEL: test_vsra_n_u8 -// CHECK: vsra.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8>
@test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]] +// CHECK: ret <8 x i8> [[TMP0]] uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) { return vsra_n_u8(a, b, 1); } -// CHECK-LABEL: test_vsra_n_u16 -// CHECK: vsra.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1> +// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i16> [[TMP4]] uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) { return vsra_n_u16(a, b, 1); } -// CHECK-LABEL: test_vsra_n_u32 -// CHECK: vsra.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], <i32 1, i32 1> +// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i32> [[TMP4]] uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) { return vsra_n_u32(a, b, 1); } -// CHECK-LABEL: test_vsra_n_u64 -// CHECK: vsra.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], <i64 1> +// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]] +// CHECK: ret <1 x i64> [[TMP4]] uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) { return vsra_n_u64(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_s8 -// CHECK: vsra.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]] +// CHECK: ret <16 x i8> [[TMP0]] int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) { return vsraq_n_s8(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_s16 -// CHECK: vsra.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <8 x i16> [[TMP4]] int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) { return vsraq_n_s16(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_s32 -// CHECK: vsra.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]]
= bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i32> [[TMP4]] int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) { return vsraq_n_s32(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_s64 -// CHECK: vsra.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], <i64 1, i64 1> +// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i64> [[TMP4]] int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) { return vsraq_n_s64(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_u8 -// CHECK: vsra.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]] +// CHECK: ret <16 x i8> [[TMP0]] uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) { return vsraq_n_u8(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_u16 -// CHECK: vsra.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> +// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] +// CHECK: ret <8 x i16> [[TMP4]] uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) { return vsraq_n_u16(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_u32 -// CHECK: vsra.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] +// CHECK: ret <4 x i32> [[TMP4]] uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) { return vsraq_n_u32(a, b, 1); } -// CHECK-LABEL: test_vsraq_n_u64 -// CHECK: vsra.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], <i64 1, i64 1> +// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] +// CHECK: ret <2 x i64> [[TMP4]] uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) { return vsraq_n_u64(a, b, 1); } -// CHECK-LABEL: test_vsri_n_s8 -// CHECK: vsri.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +//
CHECK-LABEL: define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 1) +// CHECK: ret <8 x i8> [[VSRI_N]] int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) { return vsri_n_s8(a, b, 1); } -// CHECK-LABEL: test_vsri_n_s16 -// CHECK: vsri.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 1) +// CHECK: ret <4 x i16> [[VSRI_N2]] int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) { return vsri_n_s16(a, b, 1); } -// CHECK-LABEL: test_vsri_n_s32 -// CHECK: vsri.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 1) +// CHECK: ret <2 x i32> [[VSRI_N2]] int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) { return vsri_n_s32(a, b, 1); } -// CHECK-LABEL: test_vsri_n_s64 -// CHECK: vsri.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vsri_n_s64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1) +// CHECK: ret <1 x i64> [[VSRI_N2]] int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) { return vsri_n_s64(a, b, 1); } -// CHECK-LABEL: test_vsri_n_u8 -// CHECK: vsri.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vsri_n_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 1) +// CHECK: ret <8 x i8> [[VSRI_N]] uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) { return vsri_n_u8(a, b, 1); } -// CHECK-LABEL: test_vsri_n_u16 -// CHECK: vsri.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsri_n_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 1) +// CHECK: ret <4 x i16> [[VSRI_N2]] uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) { return vsri_n_u16(a, b, 1); } -// CHECK-LABEL: test_vsri_n_u32 -// CHECK: vsri.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i32> @test_vsri_n_u32(<2 x i32> %a, <2 x i32> %b) #0
{ +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 1) +// CHECK: ret <2 x i32> [[VSRI_N2]] uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) { return vsri_n_u32(a, b, 1); } -// CHECK-LABEL: test_vsri_n_u64 -// CHECK: vsri.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <1 x i64> @test_vsri_n_u64(<1 x i64> %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1) +// CHECK: ret <1 x i64> [[VSRI_N2]] uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) { return vsri_n_u64(a, b, 1); } -// CHECK-LABEL: test_vsri_n_p8 -// CHECK: vsri.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 1) +// CHECK: ret <8 x i8> [[VSRI_N]] poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) { return vsri_n_p8(a, b, 1); } -// CHECK-LABEL: test_vsri_n_p16 -// CHECK: vsri.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 1) +// CHECK: ret <4 x i16> [[VSRI_N2]] poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) { return vsri_n_p16(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_s8 -// CHECK: vsri.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) +// CHECK: ret <16 x i8> [[VSRI_N]] int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) { return vsriq_n_s8(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_s16 -// CHECK: vsri.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 1) +// CHECK: ret <8 x i16> [[VSRI_N2]] int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) { return vsriq_n_s16(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_s32 -// CHECK: vsri.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a
to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 1) +// CHECK: ret <4 x i32> [[VSRI_N2]] int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) { return vsriq_n_s32(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_s64 -// CHECK: vsri.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 1) +// CHECK: ret <2 x i64> [[VSRI_N2]] int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) { return vsriq_n_s64(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_u8 -// CHECK: vsri.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsriq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) +// CHECK: ret <16 x i8> [[VSRI_N]] uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) { return vsriq_n_u8(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_u16 -// CHECK: vsri.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 1) +// CHECK: ret <8 x i16> [[VSRI_N2]] uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) { return vsriq_n_u16(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_u32 -// CHECK: vsri.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 1) +// CHECK: ret <4 x i32> [[VSRI_N2]] uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) { return vsriq_n_u32(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_u64 -// CHECK: vsri.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 1) +// CHECK: ret <2 x i64> [[VSRI_N2]] uint64x2_t test_vsriq_n_u64(uint64x2_t a,
uint64x2_t b) { return vsriq_n_u64(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_p8 -// CHECK: vsri.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1) +// CHECK: ret <16 x i8> [[VSRI_N]] poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) { return vsriq_n_p8(a, b, 1); } -// CHECK-LABEL: test_vsriq_n_p16 -// CHECK: vsri.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}} +// CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 1) +// CHECK: ret <8 x i16> [[VSRI_N2]] poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) { return vsriq_n_p16(a, b, 1); } -// CHECK-LABEL: test_vst1q_u8 -// CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_u8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: store <16 x i8> %b, <16 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1q_u8(uint8_t * a, uint8x16_t b) { vst1q_u8(a, b); } -// CHECK-LABEL: test_vst1q_u16 -// CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_u16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1q_u16(uint16_t * a, uint16x8_t b) { vst1q_u16(a, b); } -// CHECK-LABEL: test_vst1q_u32 -// CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_u32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1q_u32(uint32_t * a, uint32x4_t b) { vst1q_u32(a, b); } -// CHECK-LABEL: test_vst1q_u64 -// CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define void @test_vst1q_u64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1q_u64(uint64_t * a, uint64x2_t b) { vst1q_u64(a, b); } -// CHECK-LABEL: test_vst1q_s8 -// CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_s8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: store <16 x i8> %b, <16 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1q_s8(int8_t * a, int8x16_t b) { vst1q_s8(a, b); } -// CHECK-LABEL: test_vst1q_s16 -// CHECK: vst1.16
{d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_s16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1q_s16(int16_t * a, int16x8_t b) { vst1q_s16(a, b); } -// CHECK-LABEL: test_vst1q_s32 -// CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_s32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1q_s32(int32_t * a, int32x4_t b) { vst1q_s32(a, b); } -// CHECK-LABEL: test_vst1q_s64 -// CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define void @test_vst1q_s64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1q_s64(int64_t * a, int64x2_t b) { vst1q_s64(a, b); } -// CHECK-LABEL: test_vst1q_f16 -// CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_f16(half* %a, <8 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1q_f16(float16_t * a, float16x8_t b) { vst1q_f16(a, b); } -// CHECK-LABEL: test_vst1q_f32 -// CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_f32(float* %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: store <4 x float> [[TMP3]], <4 x float>* [[TMP2]] +// CHECK: ret void void test_vst1q_f32(float32_t * a, float32x4_t b) { vst1q_f32(a, b); } -// CHECK-LABEL: test_vst1q_p8 -// CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_p8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>* +// CHECK: store <16 x i8> %b, <16 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1q_p8(poly8_t * a, poly8x16_t b) { vst1q_p8(a, b); } -// CHECK-LABEL: test_vst1q_p16 -// CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_p16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] +// CHECK: ret void void 
test_vst1q_p16(poly16_t * a, poly16x8_t b) { vst1q_p16(a, b); } -// CHECK-LABEL: test_vst1_u8 -// CHECK: vst1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_u8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: store <8 x i8> %b, <8 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1_u8(uint8_t * a, uint8x8_t b) { vst1_u8(a, b); } -// CHECK-LABEL: test_vst1_u16 -// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_u16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_u16(uint16_t * a, uint16x4_t b) { vst1_u16(a, b); } -// CHECK-LABEL: test_vst1_u32 -// CHECK: vst1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_u32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1_u32(uint32_t * a, uint32x2_t b) { vst1_u32(a, b); } -// CHECK-LABEL: test_vst1_u64 -// CHECK: vst1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define void @test_vst1_u64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1_u64(uint64_t * a, uint64x1_t b) { vst1_u64(a, b); } -// CHECK-LABEL: test_vst1_s8 -// CHECK: vst1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_s8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: store <8 x i8> %b, <8 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1_s8(int8_t * a, int8x8_t b) { vst1_s8(a, b); } -// CHECK-LABEL: test_vst1_s16 -// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_s16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_s16(int16_t * a, int16x4_t b) { vst1_s16(a, b); } -// CHECK-LABEL: test_vst1_s32 -// CHECK: vst1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_s32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP2]] +// CHECK: ret void void test_vst1_s32(int32_t * a, int32x2_t b) { vst1_s32(a, b); } -// CHECK-LABEL: test_vst1_s64 -// CHECK: vst1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}] +// CHECK-LABEL: define void @test_vst1_s64(i64* %a, <1 x i64> %b) #0 { +// CHECK: 
[[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]] +// CHECK: ret void void test_vst1_s64(int64_t * a, int64x1_t b) { vst1_s64(a, b); } -// CHECK-LABEL: test_vst1_f16 -// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_f16(half* %a, <4 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_f16(float16_t * a, float16x4_t b) { vst1_f16(a, b); } -// CHECK-LABEL: test_vst1_f32 -// CHECK: vst1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_f32(float* %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: store <2 x float> [[TMP3]], <2 x float>* [[TMP2]] +// CHECK: ret void void test_vst1_f32(float32_t * a, float32x2_t b) { vst1_f32(a, b); } -// CHECK-LABEL: test_vst1_p8 -// CHECK: vst1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_p8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>* +// CHECK: store <8 x i8> %b, <8 x i8>* [[TMP0]] +// CHECK: ret void void test_vst1_p8(poly8_t * a, poly8x8_t b) { vst1_p8(a, b); } -// CHECK-LABEL: test_vst1_p16 -// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_p16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] +// CHECK: ret void void test_vst1_p16(poly16_t * a, poly16x4_t b) { vst1_p16(a, b); } -// CHECK-LABEL: test_vst1q_lane_u8 -// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_lane_u8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15 +// CHECK: store i8 [[TMP0]], i8* %a +// CHECK: ret void void test_vst1q_lane_u8(uint8_t * a, uint8x16_t b) { vst1q_lane_u8(a, b, 15); } -// CHECK-LABEL: test_vst1q_lane_u16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1q_lane_u16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_u16(uint16_t * a, uint16x8_t b) { vst1q_lane_u16(a, b, 7); } -// CHECK-LABEL: test_vst1q_lane_u32 -// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define void @test_vst1q_lane_u32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = 
bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: store i32 [[TMP3]], i32* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_u32(uint32_t * a, uint32x4_t b) { vst1q_lane_u32(a, b, 3); } -// CHECK-LABEL: test_vst1q_lane_u64 -// CHECK: {{str|vstr|vmov}} +// CHECK-LABEL: define void @test_vst1q_lane_u64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: store i64 [[TMP3]], i64* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_u64(uint64_t * a, uint64x2_t b) { vst1q_lane_u64(a, b, 1); } -// CHECK-LABEL: test_vst1q_lane_s8 -// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15 +// CHECK: store i8 [[TMP0]], i8* %a +// CHECK: ret void void test_vst1q_lane_s8(int8_t * a, int8x16_t b) { vst1q_lane_s8(a, b, 15); } -// CHECK-LABEL: test_vst1q_lane_s16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_s16(int16_t * a, int16x8_t b) { vst1q_lane_s16(a, b, 7); } -// CHECK-LABEL: test_vst1q_lane_s32 -// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: store i32 [[TMP3]], i32* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_s32(int32_t * a, int32x4_t b) { vst1q_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vst1q_lane_s64 -// CHECK: {{str|vstr|vmov}} +// CHECK-LABEL: define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: store i64 [[TMP3]], i64* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_s64(int64_t * a, int64x2_t b) { vst1q_lane_s64(a, b, 1); } -// CHECK-LABEL: test_vst1q_lane_f16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1q_lane_f16(half* %a, <8 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <8 x 
i16> [[TMP2]], i32 7 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_f16(float16_t * a, float16x8_t b) { vst1q_lane_f16(a, b, 7); } -// CHECK-LABEL: test_vst1q_lane_f32 -// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float* +// CHECK: store float [[TMP3]], float* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_f32(float32_t * a, float32x4_t b) { vst1q_lane_f32(a, b, 3); } -// CHECK-LABEL: test_vst1q_lane_p8 -// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1q_lane_p8(i8* %a, <16 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15 +// CHECK: store i8 [[TMP0]], i8* %a +// CHECK: ret void void test_vst1q_lane_p8(poly8_t * a, poly8x16_t b) { vst1q_lane_p8(a, b, 15); } -// CHECK-LABEL: test_vst1q_lane_p16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1q_lane_p16(i16* %a, <8 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1q_lane_p16(poly16_t * a, poly16x8_t b) { vst1q_lane_p16(a, b, 7); } -// CHECK-LABEL: test_vst1_lane_u8 -// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_lane_u8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7 +// CHECK: store i8 [[TMP0]], i8* %a +// CHECK: ret void void test_vst1_lane_u8(uint8_t * a, uint8x8_t b) { vst1_lane_u8(a, b, 7); } -// CHECK-LABEL: test_vst1_lane_u16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1_lane_u16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1_lane_u16(uint16_t * a, uint16x4_t b) { vst1_lane_u16(a, b, 3); } -// CHECK-LABEL: test_vst1_lane_u32 -// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define void @test_vst1_lane_u32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: store i32 [[TMP3]], i32* [[TMP4]] +// CHECK: ret void void test_vst1_lane_u32(uint32_t * a, uint32x2_t b) { vst1_lane_u32(a, b, 1); } -// CHECK-LABEL: test_vst1_lane_u64 -// CHECK: {{str|vstr|vmov}} +// CHECK-LABEL: define void 
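// Illustrative sketch (not part of the patch): the *_lane forms store a single
// element, which is why the checks above match an `extractelement` followed by
// a scalar store (the 8-bit cases skip the bitcast round-trip because the
// pointer is already i8*). Assumes an AArch64 host with NEON;
// `check_vst1_lane_u16` is a hypothetical helper, not one of the tests.
#include <arm_neon.h>
#include <assert.h>

static void check_vst1_lane_u16(void) {
  uint16_t src[4] = {10, 11, 12, 13};
  uint16_t dst = 0;
  uint16x4_t v = vld1_u16(src);
  vst1_lane_u16(&dst, v, 3); /* writes one i16: element 3 of v */
  assert(dst == 13);
}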
@test_vst1_lane_u64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: store i64 [[TMP3]], i64* [[TMP4]] +// CHECK: ret void void test_vst1_lane_u64(uint64_t * a, uint64x1_t b) { vst1_lane_u64(a, b, 0); } -// CHECK-LABEL: test_vst1_lane_s8 -// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7 +// CHECK: store i8 [[TMP0]], i8* %a +// CHECK: ret void void test_vst1_lane_s8(int8_t * a, int8x8_t b) { vst1_lane_s8(a, b, 7); } -// CHECK-LABEL: test_vst1_lane_s16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1_lane_s16(int16_t * a, int16x4_t b) { vst1_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vst1_lane_s32 -// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32* +// CHECK: store i32 [[TMP3]], i32* [[TMP4]] +// CHECK: ret void void test_vst1_lane_s32(int32_t * a, int32x2_t b) { vst1_lane_s32(a, b, 1); } -// CHECK-LABEL: test_vst1_lane_s64 -// CHECK: {{str|vstr|vmov}} +// CHECK-LABEL: define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64* +// CHECK: store i64 [[TMP3]], i64* [[TMP4]] +// CHECK: ret void void test_vst1_lane_s64(int64_t * a, int64x1_t b) { vst1_lane_s64(a, b, 0); } -// CHECK-LABEL: test_vst1_lane_f16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1_lane_f16(half* %a, <4 x half> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1_lane_f16(float16_t * a, float16x4_t b) { vst1_lane_f16(a, b, 3); } -// CHECK-LABEL: test_vst1_lane_f32 -// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32] +// CHECK-LABEL: define void @test_vst1_lane_f32(float* %a, <2 x float> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> 
%b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float* +// CHECK: store float [[TMP3]], float* [[TMP4]] +// CHECK: ret void void test_vst1_lane_f32(float32_t * a, float32x2_t b) { vst1_lane_f32(a, b, 1); } -// CHECK-LABEL: test_vst1_lane_p8 -// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst1_lane_p8(i8* %a, <8 x i8> %b) #0 { +// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7 +// CHECK: store i8 [[TMP0]], i8* %a +// CHECK: ret void void test_vst1_lane_p8(poly8_t * a, poly8x8_t b) { vst1_lane_p8(a, b, 7); } -// CHECK-LABEL: test_vst1_lane_p16 -// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16] +// CHECK-LABEL: define void @test_vst1_lane_p16(i16* %a, <4 x i16> %b) #0 { +// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] +// CHECK: ret void void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) { vst1_lane_p16(a, b, 3); } -// CHECK-LABEL: test_vst2q_u8 -// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_u8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) { vst2q_u8(a, b); } -// CHECK-LABEL: test_vst2q_u16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = 
bitcast %struct.uint16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) { vst2q_u16(a, b); } -// CHECK-LABEL: test_vst2q_u32 -// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) { vst2q_u32(a, b); } -// CHECK-LABEL: test_vst2q_s8 -// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds 
%struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2q_s8(int8_t * a, int8x16x2_t b) { vst2q_s8(a, b); } -// CHECK-LABEL: test_vst2q_s16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_s16(int16_t * a, int16x8x2_t b) { vst2q_s16(a, b); } -// CHECK-LABEL: test_vst2q_s32 -// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] 
[[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_s32(int32_t * a, int32x4x2_t b) { vst2q_s32(a, b); } -// CHECK-LABEL: test_vst2q_f16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_f16(float16_t * a, float16x8x2_t b) { vst2q_f16(a, b); } -// CHECK-LABEL: test_vst2q_f32 -// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 { +// 
CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: call void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_f32(float32_t * a, float32x4x2_t b) { vst2q_f32(a, b); } -// CHECK-LABEL: test_vst2q_p8 -// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_p8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) { vst2q_p8(a, b); } -// CHECK-LABEL: test_vst2q_p16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca 
%struct.poly16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) { vst2q_p16(a, b); } -// CHECK-LABEL: test_vst2_u8 -// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2_u8(uint8_t * a, uint8x8x2_t b) { vst2_u8(a, b); } -// CHECK-LABEL: test_vst2_u16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds 
%struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_u16(uint16_t * a, uint16x4x2_t b) { vst2_u16(a, b); } -// CHECK-LABEL: test_vst2_u32 -// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_u32(uint32_t * a, uint32x2x2_t b) { vst2_u32(a, b); } -// CHECK-LABEL: test_vst2_u64 -// CHECK: vst1.64 +// CHECK-LABEL: define void @test_vst2_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, 
align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_u64(uint64_t * a, uint64x1x2_t b) { vst2_u64(a, b); } -// CHECK-LABEL: test_vst2_s8 -// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2_s8(int8_t * a, int8x8x2_t b) { vst2_s8(a, b); } -// CHECK-LABEL: test_vst2_s16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0 +// 
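// Note on the recurring preamble above (an illustration, not part of the
// patch): AAPCS64 passes these homogeneous NEON structs as a [2 x <N x T>]
// array, so every vst2 test opens the same way: the coerced argument is
// stored into the local for `b`, copied into the implicit `__s1` temporary
// with @llvm.memcpy, and each element is reloaded through getelementptr
// before the @llvm.aarch64.neon.st2 call. `opt -mem2reg` leaves this in
// place because the allocas are used by memcpy and GEPs rather than by
// simple scalar loads and stores, so the checks must match it explicitly.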
CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_s16(int16_t * a, int16x4x2_t b) { vst2_s16(a, b); } -// CHECK-LABEL: test_vst2_s32 -// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_s32(int32_t * a, int32x2x2_t b) { vst2_s32(a, b); } -// CHECK-LABEL: test_vst2_s64 -// CHECK: vst1.64 +// CHECK-LABEL: define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8 +// CHECK: 
[[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_s64(int64_t * a, int64x1x2_t b) { vst2_s64(a, b); } -// CHECK-LABEL: test_vst2_f16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_f16(float16_t * a, float16x4x2_t b) { vst2_f16(a, b); } -// CHECK-LABEL: test_vst2_f32 -// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void 
@test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: call void @llvm.aarch64.neon.st2.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_f32(float32_t * a, float32x2x2_t b) { vst2_f32(a, b); } -// CHECK-LABEL: test_vst2_p8 -// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a) +// CHECK: ret void void test_vst2_p8(poly8_t * a, poly8x8x2_t b) { vst2_p8(a, b); } -// CHECK-LABEL: test_vst2_p16 -// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: 
[[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]]) +// CHECK: ret void void test_vst2_p16(poly16_t * a, poly16x4x2_t b) { vst2_p16(a, b); } -// CHECK-LABEL: test_vst2q_lane_u16 -// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) { vst2q_lane_u16(a, b, 7); } -// 
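// Illustrative sketch (not part of the patch) of the semantics behind the
// st2/st2lane checks: vst2q interleaves its two registers element-wise, and
// vst2q_lane stores only the chosen lane of each register, which is why the
// new checks end in @llvm.aarch64.neon.st2lane with the lane index as an i64
// operand. Assumes an AArch64 host with NEON; `check_vst2q_lane_u16` is a
// hypothetical helper, not one of the tests.
#include <arm_neon.h>
#include <assert.h>

static void check_vst2q_lane_u16(void) {
  uint16_t a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  uint16_t b[8] = {100, 101, 102, 103, 104, 105, 106, 107};
  uint16x8x2_t v;
  v.val[0] = vld1q_u16(a);
  v.val[1] = vld1q_u16(b);

  uint16_t interleaved[16] = {0};
  vst2q_u16(interleaved, v);  /* writes a0,b0,a1,b1,... */
  assert(interleaved[14] == 7 && interleaved[15] == 107);

  uint16_t pair[2] = {0, 0};
  vst2q_lane_u16(pair, v, 7); /* stores just lane 7 of each register */
  assert(pair[0] == 7 && pair[1] == 107);
}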
CHECK-LABEL: test_vst2q_lane_u32 -// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) { vst2q_lane_u32(a, b, 3); } -// CHECK-LABEL: test_vst2q_lane_s16 -// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x 
i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) { vst2q_lane_s16(a, b, 7); } -// CHECK-LABEL: test_vst2q_lane_s32 -// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) { vst2q_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vst2q_lane_f16 -// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0 
-// CHECK-LABEL: test_vst2q_lane_f16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x half>] %b.coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
   vst2q_lane_f16(a, b, 7);
 }
-// CHECK-LABEL: test_vst2q_lane_f32
-// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <4 x float>] %b.coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK: call void @llvm.aarch64.neon.st2lane.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
   vst2q_lane_f32(a, b, 3);
 }
-// CHECK-LABEL: test_vst2q_lane_p16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK: store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) { vst2q_lane_p16(a, b, 7); } -// CHECK-LABEL: test_vst2_lane_u8 -// CHECK: vst2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a) +// CHECK: ret void void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) { vst2_lane_u8(a, b, 7); } -// CHECK-LABEL: test_vst2_lane_u16 -// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x 
<4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) { vst2_lane_u16(a, b, 3); } -// CHECK-LABEL: test_vst2_lane_u32 -// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) { vst2_lane_u32(a, b, 1); } -// CHECK-LABEL: test_vst2_lane_s8 -// CHECK: vst2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* 
[[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a) +// CHECK: ret void void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) { vst2_lane_s8(a, b, 7); } -// CHECK-LABEL: test_vst2_lane_s16 -// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) { vst2_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vst2_lane_s32 -// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = 
bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) { vst2_lane_s32(a, b, 1); } -// CHECK-LABEL: test_vst2_lane_f16 -// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) { vst2_lane_f16(a, b, 3); } -// CHECK-LABEL: test_vst2_lane_f32 -// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <2 x float>] 
[[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: call void @llvm.aarch64.neon.st2lane.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) { vst2_lane_f32(a, b, 1); } -// CHECK-LABEL: test_vst2_lane_p8 -// CHECK: vst2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a) +// CHECK: ret void void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) { vst2_lane_p8(a, b, 7); } -// CHECK-LABEL: test_vst2_lane_p16 -// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x 
i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
 void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
   vst2_lane_p16(a, b, 3);
 }
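+// The vst3q tests below follow the same shape as the vst2 ones above: the
+// coerced [3 x <N x T>] argument is spilled to a local copy, the three
+// vectors are reloaded one array element at a time (through <16 x i8>
+// bitcasts for the non-i8 element types), and a single
+// @llvm.aarch64.neon.st3.* call consumes them. A usage sketch (comment only,
+// assuming hypothetical vectors v0..v2):
+//   uint8x16x3_t triple = { { v0, v1, v2 } };
+//   vst3q_u8(dst, triple); // writes v0[0], v1[0], v2[0], v0[1], ... to dst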
-// CHECK-LABEL: test_vst3q_u8
-// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_u8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK: ret void
 void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
   vst3q_u8(a, b);
 }
-// CHECK-LABEL: test_vst3q_u16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
   vst3q_u16(a, b);
 }
-// CHECK-LABEL: test_vst3q_u32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:
[[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) { vst3q_u32(a, b); } -// CHECK-LABEL: test_vst3q_s8 -// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst3q_s8(int8_t * a, int8x16x3_t b) { vst3q_s8(a, b); } -// CHECK-LABEL: test_vst3q_s16 -// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds 
%struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_s16(int16_t * a, int16x8x3_t b) { vst3q_s16(a, b); } -// CHECK-LABEL: test_vst3q_s32 -// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> 
[[TMP8]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_s32(int32_t * a, int32x4x3_t b) { vst3q_s32(a, b); } -// CHECK-LABEL: test_vst3q_f16 -// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst3q_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_f16(float16_t * a, float16x8x3_t b) { vst3q_f16(a, b); } -// CHECK-LABEL: test_vst3q_f32 -// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// 
CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> +// CHECK: call void @llvm.aarch64.neon.st3.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP1]]0, <4 x float> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_f32(float32_t * a, float32x4x3_t b) { vst3q_f32(a, b); } -// CHECK-LABEL: test_vst3q_p8 -// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst3q_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) { vst3q_p8(a, b); } -// CHECK-LABEL: test_vst3q_p16 -// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst3q_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca 
%struct.poly16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) { vst3q_p16(a, b); } -// CHECK-LABEL: test_vst3_u8 -// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, 
%struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst3_u8(uint8_t * a, uint8x8x3_t b) { vst3_u8(a, b); } -// CHECK-LABEL: test_vst3_u16 -// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_u16(uint16_t * a, uint16x4x3_t b) { vst3_u16(a, b); } -// CHECK-LABEL: test_vst3_u32 -// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = 
bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st3.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP1]]0, <2 x i32> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_u32(uint32_t * a, uint32x2x3_t b) { vst3_u32(a, b); } -// CHECK-LABEL: test_vst3_u64 -// CHECK: vst1.64 +// CHECK-LABEL: define void @test_vst3_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP11:%.*]] 
= bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP1]]0, <1 x i64> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_u64(uint64_t * a, uint64x1x3_t b) { vst3_u64(a, b); } -// CHECK-LABEL: test_vst3_s8 -// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst3_s8(int8_t * a, int8x8x3_t b) { vst3_s8(a, b); } -// CHECK-LABEL: test_vst3_s16 -// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, 
align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_s16(int16_t * a, int16x4x3_t b) {
   vst3_s16(a, b);
 }
-// CHECK-LABEL: test_vst3_s32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK: store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st3.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK: ret void
 void test_vst3_s32(int32_t * a, int32x2x3_t b) {
   vst3_s32(a, b);
 }
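+// Note that the 64-bit element case keeps a real three-way store on AArch64:
+// the removed 32-bit ARM checks fell back to vst1.64, while here vst3_s64
+// (and vst3_u64 earlier) lowers to @llvm.aarch64.neon.st3.v1i64 on <1 x i64>
+// operands, so the interleaving is degenerate but the intrinsic is uniform.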
store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP1]]0, <1 x i64> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_s64(int64_t * a, int64x1x3_t b) { vst3_s64(a, b); } -// CHECK-LABEL: test_vst3_f16 -// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 
0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_f16(float16_t * a, float16x4x3_t b) { vst3_f16(a, b); } -// CHECK-LABEL: test_vst3_f32 -// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> +// CHECK: call void @llvm.aarch64.neon.st3.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP1]]0, <2 x float> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_f32(float32_t * a, float32x2x3_t b) { vst3_f32(a, b); } -// CHECK-LABEL: test_vst3_p8 -// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 
x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a) +// CHECK: ret void void test_vst3_p8(poly8_t * a, poly8x8x3_t b) { vst3_p8(a, b); } -// CHECK-LABEL: test_vst3_p16 -// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], 
<4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_p16(poly16_t * a, poly16x4x3_t b) { vst3_p16(a, b); } -// CHECK-LABEL: test_vst3q_lane_u16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) { vst3q_lane_u16(a, b, 7); } -// CHECK-LABEL: test_vst3q_lane_u32 -// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = 
getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) { vst3q_lane_u32(a, b, 3); } -// CHECK-LABEL: test_vst3q_lane_s16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// 
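// Illustrative sketch (not part of the test under review): unlike the
// whole-vector vst3q stores, the vst3q_lane variants checked here write a
// single element from each of the three Q-register sources, and the lane index
// is a compile-time immediate, which is why it appears as a constant i64
// operand of @llvm.aarch64.neon.st3lane. Hypothetical helper:
#include <arm_neon.h>
void demo_vst3q_lane_u32(uint32_t *dst, uint32x4x3_t v) {
  // Writes v.val[0][3], v.val[1][3], v.val[2][3] to dst[0..2]; the lane
  // immediate must lie in [0, 3] for four-element vectors.
  vst3q_lane_u32(dst, v, 3);
}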
CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) { vst3q_lane_s16(a, b, 7); } -// CHECK-LABEL: test_vst3q_lane_s32 -// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) { vst3q_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vst3q_lane_f16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds 
%struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) { vst3q_lane_f16(a, b, 7); } -// CHECK-LABEL: test_vst3q_lane_f32 -// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> +// CHECK: 
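// Note (illustrative, not part of the test): the float16 checks above contain
// no half-precision arithmetic; the <8 x half> values are bitcast to
// <8 x i16> and stored through the same @llvm.aarch64.neon.st3lane.v8i16
// intrinsic as the integer variants. At the C level the call looks the same as
// for any other element type (hypothetical helper):
#include <arm_neon.h>
void demo_vst3q_lane_f16(float16_t *dst, float16x8x3_t v) {
  vst3q_lane_f16(dst, v, 7);  // stores lane 7 of each source vector
}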
[[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP1]]0, <4 x float> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) { vst3q_lane_f32(a, b, 3); } -// CHECK-LABEL: test_vst3q_lane_p16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, i64 7, i8* [[TMP2]]) +// CHECK: ret void void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) { vst3q_lane_p16(a, b, 7); } -// CHECK-LABEL: test_vst3_lane_u8 -// CHECK: vst3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8* +// 
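// Note (illustrative, not part of the test): every CHECK block in these
// vst3/vst4 tests opens with the same alloca + store + memcpy prologue because
// the NEON "xN" aggregates are passed by value: the frontend coerces them to
// [N x <n x T>] array arguments, spills the coerced argument into the [[B]]
// alloca, then copies it into the intrinsic's own __s1 temporary. Since the
// RUN line only applies -mem2reg, that copy survives into the checked IR. The
// aggregate itself is just a struct of vectors, equivalent in shape to:
#include <arm_neon.h>
typedef struct {
  uint8x8_t val[3];  // same layout as uint8x8x3_t from <arm_neon.h>
} demo_uint8x8x3_t;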
CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a) +// CHECK: ret void void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) { vst3_lane_u8(a, b, 7); } -// CHECK-LABEL: test_vst3_lane_u16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void 
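// Illustrative sketch (not part of the test under review): the non-q
// vst3_lane variants operate on 64-bit D-register vectors, so the valid lane
// range shrinks with the wider elements, e.g. [0, 3] for uint16x4_t
// (hypothetical helper):
#include <arm_neon.h>
void demo_vst3_lane_u16(uint16_t *dst, uint16x4x3_t v) {
  vst3_lane_u16(dst, v, 3);  // dst[0..2] = lane 3 of each source vector
}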
test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) { vst3_lane_u16(a, b, 3); } -// CHECK-LABEL: test_vst3_lane_u32 -// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP1]]0, <2 x i32> [[TMP1]]1, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) { vst3_lane_u32(a, b, 1); } -// CHECK-LABEL: test_vst3_lane_s8 -// CHECK: vst3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], 
align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a) +// CHECK: ret void void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) { vst3_lane_s8(a, b, 7); } -// CHECK-LABEL: test_vst3_lane_s16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) { vst3_lane_s16(a, b, 3); } -// CHECK-LABEL: test_vst3_lane_s32 -// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca 
%struct.int32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> +// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP1]]0, <2 x i32> [[TMP1]]1, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) { vst3_lane_s32(a, b, 1); } -// CHECK-LABEL: test_vst3_lane_f16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] 
= load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) { vst3_lane_f16(a, b, 3); } -// CHECK-LABEL: test_vst3_lane_f32 -// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> +// CHECK: call void @llvm.aarch64.neon.st3lane.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP1]]0, <2 x float> [[TMP1]]1, i64 1, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) { vst3_lane_f32(a, b, 1); } -// CHECK-LABEL: test_vst3_lane_p8 -// CHECK: vst3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// 
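// Note (illustrative, not part of the test): the polynomial variants are
// distinct types at the C level, but as the following checks show they lower
// to exactly the same @llvm.aarch64.neon.st3lane.v8i8 / .v4i16 calls as the
// u8/u16 tests above (hypothetical helper):
#include <arm_neon.h>
void demo_vst3_lane_p8(poly8_t *dst, poly8x8x3_t v) {
  vst3_lane_p8(dst, v, 7);
}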
CHECK-LABEL: define void @test_vst3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8 +// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a) +// CHECK: ret void void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) { vst3_lane_p8(a, b, 7); } -// CHECK-LABEL: test_vst3_lane_p16 -// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x 
i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) { vst3_lane_p16(a, b, 3); } -// CHECK-LABEL: test_vst4q_u8 -// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]6, align 16 +// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a) +// CHECK: ret void void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) { vst4q_u8(a, b); } -// CHECK-LABEL: test_vst4q_u16 -// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* 
[[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) { vst4q_u16(a, b); } -// CHECK-LABEL: test_vst4q_u32 -// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, 
align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st4.v4i32.p0i8(<4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, <4 x i32> [[TMP1]]3, <4 x i32> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) { vst4q_u32(a, b); } -// CHECK-LABEL: test_vst4q_s8 -// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]6, align 16 +// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a) +// CHECK: ret void void test_vst4q_s8(int8_t * a, int8x16x4_t b) { vst4q_s8(a, b); } -// CHECK-LABEL: test_vst4q_s16 
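// Illustrative sketch (not part of the test under review): vst4q interleaves
// four full Q registers; a common use is repacking planar channel data into
// packed RGBA, as in this hypothetical helper:
#include <arm_neon.h>
void demo_pack_rgba(uint8_t *dst, uint8x16_t r, uint8x16_t g,
                    uint8x16_t b, uint8x16_t a) {
  uint8x16x4_t px = { { r, g, b, a } };
  vst4q_u8(dst, px);  // dst = {r0, g0, b0, a0, r1, g1, b1, a1, ...}
}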
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_s16(int16_t * a, int16x8x4_t b) { vst4q_s16(a, b); } -// CHECK-LABEL: test_vst4q_s32 -// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast 
i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st4.v4i32.p0i8(<4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, <4 x i32> [[TMP1]]3, <4 x i32> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_s32(int32_t * a, int32x4x4_t b) { vst4q_s32(a, b); } -// CHECK-LABEL: test_vst4q_f16 -// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x half> 
[[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_f16(float16_t * a, float16x8x4_t b) { vst4q_f16(a, b); } -// CHECK-LABEL: test_vst4q_f32 -// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x 
float>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x float> +// CHECK: call void @llvm.aarch64.neon.st4.v4f32.p0i8(<4 x float> [[TMP1]]1, <4 x float> [[TMP1]]2, <4 x float> [[TMP1]]3, <4 x float> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_f32(float32_t * a, float32x4x4_t b) { vst4q_f32(a, b); } -// CHECK-LABEL: test_vst4q_p8 -// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]2, align 16 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]4, align 16 +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]]6, align 16 +// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a) +// CHECK: ret void void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) { vst4q_p8(a, b); } -// CHECK-LABEL: test_vst4q_p16 -// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} +// CHECK-LABEL: define void @test_vst4q_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8* +// CHECK: call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) { vst4q_p16(a, b); } -// CHECK-LABEL: test_vst4_u8 -// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}] +// CHECK-LABEL: define void @test_vst4_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false) +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8 +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8 +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, 
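+// Usage sketch (illustrative only; the helper name is hypothetical and this
+// is not part of the checked output): every q-form vst4 above interleaves
+// four 128-bit registers and stores 64 bytes through the pointer argument:
+//   void store_four_planes_u16(uint16_t *dst, uint16x8_t p0, uint16x8_t p1,
+//                              uint16x8_t p2, uint16x8_t p3) {
+//     uint16x8x4_t v = { { p0, p1, p2, p3 } };
+//     vst4q_u16(dst, v); /* dst must have room for 32 uint16_t values */
+//   }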
-// CHECK-LABEL: test_vst4_u8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
void test_vst4_u8(uint8_t * a, uint8x8x4_t b) {
  vst4_u8(a, b);
}
-// CHECK-LABEL: test_vst4_u16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_u16(uint16_t * a, uint16x4x4_t b) {
  vst4_u16(a, b);
}
-// CHECK-LABEL: test_vst4_u32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_u32(uint32_t * a, uint32x2x4_t b) {
  vst4_u32(a, b);
}
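+// Memory-layout sketch for the d-form stores above (illustrative only, not
+// a CHECK line): vst4_u32 interleaves the two lanes of its four <2 x i32>
+// sources, so after the call the destination holds
+//   a[0] = b.val[0][0], a[1] = b.val[1][0], a[2] = b.val[2][0],
+//   a[3] = b.val[3][0], a[4] = b.val[0][1], ..., a[7] = b.val[3][1]
+// that is, one element from each source register per group of four.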
-// CHECK-LABEL: test_vst4_u64
-// CHECK: vst1.64
+// CHECK-LABEL: define void @test_vst4_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_u64(uint64_t * a, uint64x1x4_t b) {
  vst4_u64(a, b);
}
-// CHECK-LABEL: test_vst4_s8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
void test_vst4_s8(int8_t * a, int8x8x4_t b) {
  vst4_s8(a, b);
}
-// CHECK-LABEL: test_vst4_s16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_s16(int16_t * a, int16x4x4_t b) {
  vst4_s16(a, b);
}
-// CHECK-LABEL: test_vst4_s32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_s32(int32_t * a, int32x2x4_t b) {
  vst4_s32(a, b);
}
-// CHECK-LABEL: test_vst4_s64
-// CHECK: vst1.64
+// CHECK-LABEL: define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_s64(int64_t * a, int64x1x4_t b) {
  vst4_s64(a, b);
}
-// CHECK-LABEL: test_vst4_f16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_f16(float16_t * a, float16x4x4_t b) {
  vst4_f16(a, b);
}
-// CHECK-LABEL: test_vst4_f32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK: call void @llvm.aarch64.neon.st4.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_f32(float32_t * a, float32x2x4_t b) {
  vst4_f32(a, b);
}
-// CHECK-LABEL: test_vst4_p8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK: ret void
void test_vst4_p8(poly8_t * a, poly8x8x4_t b) {
  vst4_p8(a, b);
}
-// CHECK-LABEL: test_vst4_p16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
  vst4_p16(a, b);
}
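+// The _lane variants that follow store a single element from each of the
+// four registers rather than whole vectors. The lane index is a compile-time
+// constant in [0, lanes-1]; a hypothetical caller (illustration only):
+//   void store_last_lane_u16(uint16_t *dst, uint16x8x4_t v) {
+//     vst4q_lane_u16(dst, v, 7); /* writes v.val[0][7]..v.val[3][7], 8 bytes */
+//   }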
-// CHECK-LABEL: test_vst4q_lane_u16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
  vst4q_lane_u16(a, b, 7);
}
-// CHECK-LABEL: test_vst4q_lane_u32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
  vst4q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: test_vst4q_lane_s16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
  vst4q_lane_s16(a, b, 7);
}
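+// Note on the checked IR (descriptive of the lines above): the lane index
+// reaches @llvm.aarch64.neon.st4lane.* as a trailing i64 constant operand
+// (7 for the 8-lane types, 3 for the 4-lane types), placed after the four
+// data vectors and before the i8* destination pointer.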
@test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8* +// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0 +// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16 +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]1, i64 0, i64 1 +// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16 +// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]3, i64 0, i64 2 +// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16 +// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]5, i64 0, i64 3 +// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]6, align 16 +// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP1]]0 to <4 x i32> +// CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, <4 x i32> [[TMP1]]3, <4 x i32> [[TMP1]]4, i64 3, i8* [[TMP2]]) +// CHECK: ret void void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) { vst4q_lane_s32(a, b, 3); } -// CHECK-LABEL: test_vst4q_lane_f16 -// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]} +// CHECK-LABEL: define void @test_vst4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 { +// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false) +// CHECK: [[TMP2:%.*]] = 
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
  vst4q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: test_vst4q_lane_f32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x float>] %b.coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v4f32.p0i8(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
  vst4q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: test_vst4q_lane_p16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) {
  vst4q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: test_vst4_lane_u8
-// CHECK: vst4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK: ret void
void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) {
  vst4_lane_u8(a, b, 7);
}
-// CHECK-LABEL: test_vst4_lane_u16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) {
  vst4_lane_u16(a, b, 3);
}
-// CHECK-LABEL: test_vst4_lane_u32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i32>] %b.coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) {
  vst4_lane_u32(a, b, 1);
}
-// CHECK-LABEL: test_vst4_lane_s8
-// CHECK: vst4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK: ret void
void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) {
  vst4_lane_s8(a, b, 7);
}
-// CHECK-LABEL: test_vst4_lane_s16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) {
  vst4_lane_s16(a, b, 3);
}
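// Illustrative sketch (an editorial aid, not part of the checked test): per
// the IR above, vst4_lane stores lane i of each of the four source vectors
// to four consecutive elements at the pointer. A scalar model, using the
// hypothetical helper name sketch_vst4_lane_s16; the real intrinsic also
// requires the lane to be a compile-time constant:
static inline void sketch_vst4_lane_s16(int16_t *p, int16x4x4_t v, int lane) {
  p[0] = v.val[0][lane]; // element from vector 0
  p[1] = v.val[1][lane]; // element from vector 1
  p[2] = v.val[2][lane]; // element from vector 2
  p[3] = v.val[3][lane]; // element from vector 3
}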
-// CHECK-LABEL: test_vst4_lane_s32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x i32>] %b.coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
  vst4_lane_s32(a, b, 1);
}
-// CHECK-LABEL: test_vst4_lane_f16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x half>] %b.coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
  vst4_lane_f16(a, b, 3);
}
-// CHECK-LABEL: test_vst4_lane_f32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <2 x float>] %b.coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) {
  vst4_lane_f32(a, b, 1);
}
-// CHECK-LABEL: test_vst4_lane_p8
-// CHECK: vst4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK: ret void
void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) {
  vst4_lane_p8(a, b, 7);
}
-// CHECK-LABEL: test_vst4_lane_p16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK: store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK: ret void
void test_vst4_lane_p16(poly16_t * a, poly16x4x4_t b) {
  vst4_lane_p16(a, b, 3);
}
-// CHECK-LABEL: test_vsub_s8
-// CHECK: vsub.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b
+// CHECK: ret <8 x i8> [[SUB_I]]
int8x8_t test_vsub_s8(int8x8_t a, int8x8_t b) {
  return vsub_s8(a, b);
}
-// CHECK-LABEL: test_vsub_s16
-// CHECK: vsub.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b
+// CHECK: ret <4 x i16> [[SUB_I]]
int16x4_t test_vsub_s16(int16x4_t a, int16x4_t b) {
  return vsub_s16(a, b);
}
-// CHECK-LABEL: test_vsub_s32
-// CHECK: vsub.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b
+// CHECK: ret <2 x i32> [[SUB_I]]
int32x2_t test_vsub_s32(int32x2_t a, int32x2_t b) {
  return vsub_s32(a, b);
}
-// CHECK-LABEL: test_vsub_s64
-// CHECK: vsub.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b
+// CHECK: ret <1 x i64> [[SUB_I]]
int64x1_t test_vsub_s64(int64x1_t a, int64x1_t b) {
  return vsub_s64(a, b);
}
-// CHECK-LABEL: test_vsub_f32
-// CHECK: vsub.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vsub_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, %b
+// CHECK: ret <2 x float> [[SUB_I]]
float32x2_t test_vsub_f32(float32x2_t a, float32x2_t b) {
  return vsub_f32(a, b);
}
-// CHECK-LABEL: test_vsub_u8
-// CHECK: vsub.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b
+// CHECK: ret <8 x i8> [[SUB_I]]
uint8x8_t test_vsub_u8(uint8x8_t a, uint8x8_t b) {
  return vsub_u8(a, b);
}
-// CHECK-LABEL: test_vsub_u16
-// CHECK: vsub.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b
+// CHECK: ret <4 x i16> [[SUB_I]]
uint16x4_t test_vsub_u16(uint16x4_t a, uint16x4_t b) {
  return vsub_u16(a, b);
}
-// CHECK-LABEL: test_vsub_u32
-// CHECK: vsub.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b
+// CHECK: ret <2 x i32> [[SUB_I]]
uint32x2_t test_vsub_u32(uint32x2_t a, uint32x2_t b) {
  return vsub_u32(a, b);
}
-// CHECK-LABEL: test_vsub_u64
-// CHECK: vsub.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b
+// CHECK: ret <1 x i64> [[SUB_I]]
uint64x1_t test_vsub_u64(uint64x1_t a, uint64x1_t b) {
  return vsub_u64(a, b);
}
-// CHECK-LABEL: test_vsubq_s8
-// CHECK: vsub.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b
+// CHECK: ret <16 x i8> [[SUB_I]]
int8x16_t test_vsubq_s8(int8x16_t a, int8x16_t b) {
  return vsubq_s8(a, b);
}
-// CHECK-LABEL: test_vsubq_s16
-// CHECK: vsub.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b
+// CHECK: ret <8 x i16> [[SUB_I]]
int16x8_t test_vsubq_s16(int16x8_t a, int16x8_t b) {
  return vsubq_s16(a, b);
}
-// CHECK-LABEL: test_vsubq_s32
-// CHECK: vsub.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b
+// CHECK: ret <4 x i32> [[SUB_I]]
int32x4_t test_vsubq_s32(int32x4_t a, int32x4_t b) {
  return vsubq_s32(a, b);
}
-// CHECK-LABEL: test_vsubq_s64
-// CHECK: vsub.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b
+// CHECK: ret <2 x i64> [[SUB_I]]
int64x2_t test_vsubq_s64(int64x2_t a, int64x2_t b) {
  return vsubq_s64(a, b);
}
-// CHECK-LABEL: test_vsubq_f32
-// CHECK: vsub.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vsubq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, %b
+// CHECK: ret <4 x float> [[SUB_I]]
float32x4_t test_vsubq_f32(float32x4_t a, float32x4_t b) {
  return vsubq_f32(a, b);
}
-// CHECK-LABEL: test_vsubq_u8
-// CHECK: vsub.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b
+// CHECK: ret <16 x i8> [[SUB_I]]
uint8x16_t test_vsubq_u8(uint8x16_t a, uint8x16_t b) {
  return vsubq_u8(a, b);
}
-// CHECK-LABEL: test_vsubq_u16
-// CHECK: vsub.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b
+// CHECK: ret <8 x i16> [[SUB_I]]
uint16x8_t test_vsubq_u16(uint16x8_t a, uint16x8_t b) {
  return vsubq_u16(a, b);
}
-// CHECK-LABEL: test_vsubq_u32
-// CHECK: vsub.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b
+// CHECK: ret <4 x i32> [[SUB_I]]
uint32x4_t test_vsubq_u32(uint32x4_t a, uint32x4_t b) {
  return vsubq_u32(a, b);
}
-// CHECK-LABEL: test_vsubq_u64
-// CHECK: vsub.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b
+// CHECK: ret <2 x i64> [[SUB_I]]
uint64x2_t test_vsubq_u64(uint64x2_t a, uint64x2_t b) {
  return vsubq_u64(a, b);
}
-// CHECK-LABEL: test_vsubhn_s16
-// CHECK: vsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VSUBHN2_I]]
int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
  return vsubhn_s16(a, b);
}
-// CHECK-LABEL: test_vsubhn_s32
-// CHECK: vsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VSUBHN2_I]]
int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
  return vsubhn_s32(a, b);
}
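// Illustrative sketch (an editorial aid, not a CHECK'd part of this test):
// the vsubhn_* pattern checked here is sub, then lshr by half the element
// width, then trunc, i.e. the high half of the wide difference. A per-lane
// model for the s16 case, using a hypothetical helper name:
static inline int8_t sketch_vsubhn_lane_s16(int16_t x, int16_t y) {
  return (int8_t)((uint16_t)(x - y) >> 8); // keep the high byte of the difference
}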
-// CHECK-LABEL: test_vsubhn_s64
-// CHECK: vsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
+// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VSUBHN2_I]]
int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
  return vsubhn_s64(a, b);
}
-// CHECK-LABEL: test_vsubhn_u16
-// CHECK: vsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VSUBHN2_I]]
uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
  return vsubhn_u16(a, b);
}
-// CHECK-LABEL: test_vsubhn_u32
-// CHECK: vsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VSUBHN2_I]]
uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
  return vsubhn_u32(a, b);
}
-// CHECK-LABEL: test_vsubhn_u64
-// CHECK: vsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
+// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VSUBHN2_I]]
uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
  return vsubhn_u64(a, b);
}
-// CHECK-LABEL: test_vsubl_s8
-// CHECK: vsubl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK: ret <8 x i16> [[SUB_I]]
int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
  return vsubl_s8(a, b);
}
-// CHECK-LABEL: test_vsubl_s16
-// CHECK: vsubl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK: ret <4 x i32> [[SUB_I]]
int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
  return vsubl_s16(a, b);
}
-// CHECK-LABEL: test_vsubl_s32
-// CHECK: vsubl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK: ret <2 x i64> [[SUB_I]]
int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
  return vsubl_s32(a, b);
}
-// CHECK-LABEL: test_vsubl_u8
-// CHECK: vsubl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK: ret <8 x i16> [[SUB_I]]
uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
  return vsubl_u8(a, b);
}
-// CHECK-LABEL: test_vsubl_u16
-// CHECK: vsubl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK: ret <4 x i32> [[SUB_I]]
uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
  return vsubl_u16(a, b);
}
-// CHECK-LABEL: test_vsubl_u32
-// CHECK: vsubl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK: ret <2 x i64> [[SUB_I]]
uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
  return vsubl_u32(a, b);
}
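// Illustrative sketch (an editorial aid, not a CHECK'd part of this test):
// vsubl_* widens both inputs (sext for signed, zext for unsigned, as the IR
// above checks) before subtracting, so the wide difference cannot wrap.
// Per-lane model for the s16 case, using a hypothetical helper name:
static inline int32_t sketch_vsubl_lane_s16(int16_t x, int16_t y) {
  return (int32_t)x - (int32_t)y; // exact 32-bit difference
}
// The vsubw_* tests that follow differ only in that the first operand is
// already wide, so only the second operand is extended.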
-// CHECK-LABEL: test_vsubw_s8
-// CHECK: vsubw.s8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) #0 {
+// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK: ret <8 x i16> [[SUB_I]]
int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
  return vsubw_s8(a, b);
}
-// CHECK-LABEL: test_vsubw_s16
-// CHECK: vsubw.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK: ret <4 x i32> [[SUB_I]]
int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
  return vsubw_s16(a, b);
}
-// CHECK-LABEL: test_vsubw_s32
-// CHECK: vsubw.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK: ret <2 x i64> [[SUB_I]]
int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
  return vsubw_s32(a, b);
}
-// CHECK-LABEL: test_vsubw_u8
-// CHECK: vsubw.u8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) #0 {
+// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK: ret <8 x i16> [[SUB_I]]
uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
  return vsubw_u8(a, b);
}
-// CHECK-LABEL: test_vsubw_u16
-// CHECK: vsubw.u16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK: ret <4 x i32> [[SUB_I]]
uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
  return vsubw_u16(a, b);
}
-// CHECK-LABEL: test_vsubw_u32
-// CHECK: vsubw.u32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK: ret <2 x i64> [[SUB_I]]
uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
  return vsubw_u32(a, b);
}
-// CHECK-LABEL: test_vtbl1_u8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VTBL11_I]]
uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
  return vtbl1_u8(a, b);
}
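// Illustrative sketch (an editorial aid, not a CHECK'd part of this test):
// vtbl1 is a byte table lookup; the IR widens the 8-byte table to 16 bytes
// (padded with zeroes) before calling the tbl1 intrinsic, so out-of-range
// indices read 0. Per-lane model, using a hypothetical helper name:
static inline uint8_t sketch_vtbl1_lane_u8(uint8x8_t table, uint8_t idx) {
  return idx < 8 ? table[idx] : 0; // in-range lanes come from the table
}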
-// CHECK-LABEL: test_vtbl1_s8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VTBL11_I]]
int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
  return vtbl1_s8(a, b);
}
-// CHECK-LABEL: test_vtbl1_p8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VTBL11_I]]
poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
  return vtbl1_p8(a, b);
}
-// CHECK-LABEL: test_vtbl2_u8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_u8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %a.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VTBL13_I]]
uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
  return vtbl2_u8(a, b);
}
-// CHECK-LABEL: test_vtbl2_s8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_s8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %a.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VTBL13_I]]
int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
  return vtbl2_s8(a, b);
}
-// CHECK-LABEL: test_vtbl2_p8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_p8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] %a.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #4
+// CHECK: ret <8 x i8> [[VTBL13_I]]
poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
  return vtbl2_p8(a, b);
}
-// CHECK-LABEL: test_vtbl3_u8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_u8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] %a.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VTBL26_I]] uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) { return vtbl3_u8(a, b); } -// CHECK-LABEL: test_vtbl3_s8 -// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbl3_s8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] %a.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #4 +// CHECK:
ret <8 x i8> [[VTBL26_I]] int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) { return vtbl3_s8(a, b); } -// CHECK-LABEL: test_vtbl3_p8 -// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbl3_p8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[A]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VTBL26_I]] poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) { return vtbl3_p8(a, b); } -// CHECK-LABEL: test_vtbl4_u8 -// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbl4_u8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds 
[4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx6.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VTBL28_I]] uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) { return vtbl4_u8(a, b); } -// CHECK-LABEL: test_vtbl4_s8 -// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbl4_s8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 
x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx6.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VTBL28_I]] int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) { return vtbl4_s8(a, b); } -// CHECK-LABEL: test_vtbl4_p8 -// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbl4_p8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 { +// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx.i, align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx2.i, align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx4.i, align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[A]]rrayidx6.i, align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #4 +// CHECK: ret <8 x i8> [[VTBL28_I]] poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) { return vtbl4_p8(a, b); } -// CHECK-LABEL: test_vtbx1_u8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) 
#4 +// CHECK: [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8> +// CHECK: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a +// CHECK: [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]] +// CHECK: ret <8 x i8> [[VTBX_I]] uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vtbx1_u8(a, b, c); } -// CHECK-LABEL: test_vtbx1_s8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #4 +// CHECK: [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8> +// CHECK: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a +// CHECK: [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]] +// CHECK: ret <8 x i8> [[VTBX_I]] int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vtbx1_s8(a, b, c); } -// CHECK-LABEL: test_vtbx1_p8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { +// CHECK: [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +// CHECK: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #4 +// CHECK: [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8> +// CHECK: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a +// CHECK: [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> +// CHECK: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]] +// CHECK: ret <8 x i8> [[VTBX_I]] poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) { return vtbx1_p8(a, b, c); } -// CHECK-LABEL: test_vtbx2_u8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] =
getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #4 +// CHECK: ret <8 x i8> [[VTBX13_I]] uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) { return vtbx2_u8(a, b, c); } -// CHECK-LABEL: test_vtbx2_s8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #4 +// CHECK: ret <8 x i8> [[VTBX13_I]] int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) { return vtbx2_s8(a, b, c); } -// CHECK-LABEL: test_vtbx2_p8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x 
i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #4 +// CHECK: ret <8 x i8> [[VTBX13_I]] poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) { return vtbx2_p8(a, b, c); } -// CHECK-LABEL: test_vtbx3_u8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #4 +// CHECK: [[TMP4:%.*]] = icmp uge <8 x i8> %c, +// CHECK: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8> +// CHECK: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a +// CHECK: [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], +// CHECK: [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]] +// CHECK: ret <8 x i8> [[VTBX_I]] uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) { return vtbx3_u8(a, b, c); } -// CHECK-LABEL: test_vtbx3_s8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// 
CHECK-LABEL: define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #4 +// CHECK: [[TMP4:%.*]] = icmp uge <8 x i8> %c, +// CHECK: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8> +// CHECK: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a +// CHECK: [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], +// CHECK: [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]] +// CHECK: ret <8 x i8> [[VTBX_I]] int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) { return vtbx3_s8(a, b, c); } -// CHECK-LABEL: test_vtbx3_p8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds 
%struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #4 +// CHECK: [[TMP4:%.*]] = icmp uge <8 x i8> %c, +// CHECK: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8> +// CHECK: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a +// CHECK: [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], +// CHECK: [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]] +// CHECK: [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]] +// CHECK: ret <8 x i8> [[VTBX_I]] poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) { return vtbx3_p8(a, b, c); } -// CHECK-LABEL: test_vtbx4_u8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 
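+// NOTE: unlike vtbx1/vtbx3 above, vtbx4 needs no compare-and-select
+// emulation: the four 8-byte table halves are concatenated pairwise into
+// two 16-byte vectors (the two shufflevectors checked below), giving a
+// full 32-entry table, and @llvm.aarch64.neon.tbx2 itself preserves the
+// lanes of %a whose index in %c is out of range.
+// Illustrative semantics (a hedged sketch, not a CHECK directive; 'table'
+// stands for the 32 concatenated bytes of %b):
+//   for (int i = 0; i != 8; ++i)
+//     dst[i] = (c[i] < 32) ? table[c[i]] : a[i];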
+// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8 +// CHECK: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #4 +// CHECK: ret <8 x i8> [[VTBX28_I]] uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) { return vtbx4_u8(a, b, c); } -// CHECK-LABEL: test_vtbx4_s8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// CHECK-LABEL: define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8 +// CHECK: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #4 +// CHECK: ret <8 x i8> [[VTBX28_I]] int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) { return vtbx4_s8(a, b, c); } -// CHECK-LABEL: test_vtbx4_p8 -// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}} +// 
CHECK-LABEL: define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 { +// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 +// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8 +// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0 +// CHECK: [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE]]1, align 8 +// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE]].i, align 8 +// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0 +// CHECK: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8 +// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1 +// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8 +// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2 +// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8 +// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0 +// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3 +// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8 +// CHECK: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +// CHECK: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +// CHECK: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #4 +// CHECK: ret <8 x i8> [[VTBX28_I]] poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) { return vtbx4_p8(a, b, c); } -// CHECK-LABEL: test_vtrn_s8 -// CHECK: vtrn.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.int8x8x2_t @test_vtrn_s8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int8x8x2_t [[TMP8]] int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) { return vtrn_s8(a, b); } -// CHECK-LABEL: test_vtrn_s16 -// CHECK: vtrn.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.int16x4x2_t @test_vtrn_s16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6> +// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7> +// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int16x4x2_t [[TMP12]] int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) { return vtrn_s16(a, b); } -// CHECK-LABEL: test_vtrn_s32 -// CHECK: vtrn.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.int32x2x2_t @test_vtrn_s32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2> +// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr
inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3> +// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8 +// CHECK: [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.int32x2x2_t [[TMP12]] int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) { return vtrn_s32(a, b); } -// CHECK-LABEL: test_vtrn_u8 -// CHECK: vtrn.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.uint8x8x2_t @test_vtrn_u8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> +// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> +// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint8x8x2_t [[TMP8]] uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) { return vtrn_u8(a, b); } -// CHECK-LABEL: test_vtrn_u16 -// CHECK: vtrn.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.uint16x4x2_t @test_vtrn_u16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6> +//
CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP1]]2 uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) { return vtrn_u16(a, b); } -// CHECK-LABEL: test_vtrn_u32 -// CHECK: vtrn.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vtrn_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP1]]1, [2 x <2 x i32>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP1]]2 uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) { return vtrn_u32(a, b); } -// CHECK-LABEL: test_vtrn_f32 -// CHECK: vtrn.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.float32x2x2_t @test_vtrn_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast 
%struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x float>] [[TMP1]]1, [2 x <2 x float>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP1]]2 float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) { return vtrn_f32(a, b); } -// CHECK-LABEL: test_vtrn_p8 -// CHECK: vtrn.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vtrn_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x2_t [[TMP8]] poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) { return vtrn_p8(a, b); } -// CHECK-LABEL: test_vtrn_p16 -// CHECK: vtrn.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.poly16x4x2_t @test_vtrn_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// 
CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP1]]2 poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) { return vtrn_p16(a, b); } -// CHECK-LABEL: test_vtrnq_s8 -// CHECK: vtrn.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.int8x16x2_t @test_vtrnq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP8]] int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) { return vtrnq_s8(a, b); } -// CHECK-LABEL: test_vtrnq_s16 -// CHECK: vtrn.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define 
-// CHECK-LABEL: test_vtrnq_s8
-// CHECK: vtrn.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vtrnq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
+// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int8x16x2_t [[TMP8]]
int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
  return vtrnq_s8(a, b);
}
-// CHECK-LABEL: test_vtrnq_s16
-// CHECK: vtrn.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vtrnq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x2_t [[TMP12]]
int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
  return vtrnq_s16(a, b);
}
-// CHECK-LABEL: test_vtrnq_s32
-// CHECK: vtrn.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vtrnq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x2_t [[TMP12]]
int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
  return vtrnq_s32(a, b);
}
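+// Each of these two-result intrinsics returns a struct wrapping [2 x <vector>].
+// Before optimization the IR follows the pattern checked above: both shuffle
+// results are stored back to back through the __ret alloca, the aggregate is
+// copied with llvm.memcpy, and the struct is reloaded for the return. A rough
+// C model of that data flow (names are stand-ins, not part of the test):
+//
+//   int32x4x2_t __ret;
+//   __ret.val[0] = trn1_half;   // first shufflevector result
+//   __ret.val[1] = trn2_half;   // second shufflevector result
+//   return __ret;               // returned as one 32-byte aggregate copy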
-// CHECK-LABEL: test_vtrnq_u8
-// CHECK: vtrn.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vtrnq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
+// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
  return vtrnq_u8(a, b);
}
-// CHECK-LABEL: test_vtrnq_u16
-// CHECK: vtrn.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vtrnq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x2_t [[TMP12]]
uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
  return vtrnq_u16(a, b);
}
-// CHECK-LABEL: test_vtrnq_u32
-// CHECK: vtrn.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vtrnq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x2_t [[TMP12]]
uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
  return vtrnq_u32(a, b);
}
-// CHECK-LABEL: test_vtrnq_f32
-// CHECK: vtrn.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vtrnq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32>
+// CHECK: store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32>
+// CHECK: store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x float>] [[TMP11]], [2 x <4 x float>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float32x4x2_t [[TMP12]]
float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
  return vtrnq_f32(a, b);
}
-// CHECK-LABEL: test_vtrnq_p8
-// CHECK: vtrn.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vtrnq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
+// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
  return vtrnq_p8(a, b);
}
-// CHECK-LABEL: test_vtrnq_p16
-// CHECK: vtrn.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vtrnq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly16x8x2_t [[TMP12]]
poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
  return vtrnq_p16(a, b);
}
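+// The vtst_*/vtstq_* checks below share a single pattern: AND the inputs,
+// compare each lane to zero with icmp ne, then sign-extend the i1 mask back
+// to the lane width. A scalar model of one lane (illustrative only, not part
+// of the test):
+//
+//   static inline int8_t vtst_lane_model(int8_t a, int8_t b) {
+//     return (a & b) != 0 ? (int8_t)-1 : (int8_t)0; // all-ones if any bit in common
+//   }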
-// CHECK-LABEL: test_vtst_s8
-// CHECK: vtst.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtst_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
+// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VTST_I]]
uint8x8_t test_vtst_s8(int8x8_t a, int8x8_t b) {
  return vtst_s8(a, b);
}
-// CHECK-LABEL: test_vtst_s16
-// CHECK: vtst.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vtst_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_s16(int16x4_t a, int16x4_t b) {
  return vtst_s16(a, b);
}
-// CHECK-LABEL: test_vtst_s32
-// CHECK: vtst.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vtst_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VTST_I]]
uint32x2_t test_vtst_s32(int32x2_t a, int32x2_t b) {
  return vtst_s32(a, b);
}
-// CHECK-LABEL: test_vtst_u8
-// CHECK: vtst.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtst_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
+// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VTST_I]]
uint8x8_t test_vtst_u8(uint8x8_t a, uint8x8_t b) {
  return vtst_u8(a, b);
}
-// CHECK-LABEL: test_vtst_u16
-// CHECK: vtst.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vtst_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_u16(uint16x4_t a, uint16x4_t b) {
  return vtst_u16(a, b);
}
-// CHECK-LABEL: test_vtst_u32
-// CHECK: vtst.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vtst_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
+// CHECK: ret <2 x i32> [[VTST_I]]
uint32x2_t test_vtst_u32(uint32x2_t a, uint32x2_t b) {
  return vtst_u32(a, b);
}
-// CHECK-LABEL: test_vtst_p8
-// CHECK: vtst.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtst_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
+// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK: ret <8 x i8> [[VTST_I]]
uint8x8_t test_vtst_p8(poly8x8_t a, poly8x8_t b) {
  return vtst_p8(a, b);
}
-// CHECK-LABEL: test_vtst_p16
-// CHECK: vtst.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vtst_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK: ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_p16(poly16x4_t a, poly16x4_t b) {
  return vtst_p16(a, b);
}
-// CHECK-LABEL: test_vtstq_s8
-// CHECK: vtst.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vtstq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
+// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK: ret <16 x i8> [[VTST_I]]
uint8x16_t test_vtstq_s8(int8x16_t a, int8x16_t b) {
  return vtstq_s8(a, b);
}
-// CHECK-LABEL: test_vtstq_s16
-// CHECK: vtst.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vtstq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK: ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_s16(int16x8_t a, int16x8_t b) {
  return vtstq_s16(a, b);
}
-// CHECK-LABEL: test_vtstq_s32
-// CHECK: vtst.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vtstq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+// CHECK: ret <4 x i32> [[VTST_I]]
uint32x4_t test_vtstq_s32(int32x4_t a, int32x4_t b) {
  return vtstq_s32(a, b);
}
-// CHECK-LABEL: test_vtstq_u8
-// CHECK: vtst.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vtstq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
+// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK: ret <16 x i8> [[VTST_I]]
uint8x16_t test_vtstq_u8(uint8x16_t a, uint8x16_t b) {
  return vtstq_u8(a, b);
}
-// CHECK-LABEL: test_vtstq_u16
-// CHECK: vtst.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vtstq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK: ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_u16(uint16x8_t a, uint16x8_t b) {
  return vtstq_u16(a, b);
}
-// CHECK-LABEL: test_vtstq_u32
-// CHECK: vtst.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vtstq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+// CHECK: ret <4 x i32> [[VTST_I]]
uint32x4_t test_vtstq_u32(uint32x4_t a, uint32x4_t b) {
  return vtstq_u32(a, b);
}
-// CHECK-LABEL: test_vtstq_p8
-// CHECK: vtst.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vtstq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
+// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK: ret <16 x i8> [[VTST_I]]
uint8x16_t test_vtstq_p8(poly8x16_t a, poly8x16_t b) {
  return vtstq_p8(a, b);
}
-// CHECK-LABEL: test_vtstq_p16
-// CHECK: vtst.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vtstq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK: ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
  return vtstq_p16(a, b);
}
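+// The vuzp checks below reuse the two-shufflevector structure of vtrn; only
+// the lane masks differ. The expected UZP1/UZP2 masks take the even-indexed
+// and then the odd-indexed elements of the concatenated inputs, e.g. for the
+// 8-lane case <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> and
+// <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>.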
-// CHECK-LABEL: test_vuzp_s8
-// CHECK: vuzp.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vuzp_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
+// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x2_t [[TMP8]]
int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
  return vuzp_s8(a, b);
}
-// CHECK-LABEL: test_vuzp_s16
-// CHECK: vuzp.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vuzp_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x2_t [[TMP12]]
int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
  return vuzp_s16(a, b);
}
-// CHECK-LABEL: test_vuzp_s32
-// CHECK: {{vtrn|vuzp}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vuzp_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
+// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
+// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0
+// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x2_t [[TMP12]]
int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
  return vuzp_s32(a, b);
}
-// CHECK-LABEL: test_vuzp_u8
-// CHECK: vuzp.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vuzp_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
+// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
  return vuzp_u8(a, b);
}
-// CHECK-LABEL: test_vuzp_u16
-// CHECK: vuzp.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vuzp_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint16x4x2_t [[TMP12]]
uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
  return vuzp_u16(a, b);
}
-// CHECK-LABEL: test_vuzp_u32
-// CHECK: {{vtrn|vuzp}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vuzp_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
+// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
+// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0
+// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint32x2x2_t [[TMP12]]
uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
  return vuzp_u32(a, b);
}
-// CHECK-LABEL: test_vuzp_f32
-// CHECK: {{vtrn|vuzp}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vuzp_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32>
+// CHECK: store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32>
+// CHECK: store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0
+// CHECK: store [2 x <2 x float>] [[TMP11]], [2 x <2 x float>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.float32x2x2_t [[TMP12]]
float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
  return vuzp_f32(a, b);
}
-// CHECK-LABEL: test_vuzp_p8
-// CHECK: vuzp.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vuzp_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
+// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly8x8x2_t [[TMP8]]
poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
  return vuzp_p8(a, b);
}
-// CHECK-LABEL: test_vuzp_p16
-// CHECK: vuzp.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vuzp_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.poly16x4x2_t [[TMP12]]
poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
  return vuzp_p16(a, b);
}
-// CHECK-LABEL: test_vuzpq_s8
-// CHECK: vuzp.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vuzpq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
+// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int8x16x2_t [[TMP8]]
int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
  return vuzpq_s8(a, b);
}
-// CHECK-LABEL: test_vuzpq_s16
-// CHECK: vuzp.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vuzpq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int16x8x2_t [[TMP12]]
int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
  return vuzpq_s16(a, b);
}
-// CHECK-LABEL: test_vuzpq_s32
-// CHECK: {{vtrn|vuzp}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vuzpq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.int32x4x2_t [[TMP12]]
int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
  return vuzpq_s32(a, b);
}
-// CHECK-LABEL: test_vuzpq_u8
-// CHECK: vuzp.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vuzpq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
+// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
  return vuzpq_u8(a, b);
}
-// CHECK-LABEL: test_vuzpq_u16
-// CHECK: vuzp.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vuzpq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint16x8x2_t [[TMP12]]
uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
  return vuzpq_u16(a, b);
}
-// CHECK-LABEL: test_vuzpq_u32
-// CHECK: {{vtrn|vuzp}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vuzpq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.uint32x4x2_t [[TMP12]]
uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
  return vuzpq_u32(a, b);
}
-// CHECK-LABEL: test_vuzpq_f32
-// CHECK: {{vtrn|vuzp}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vuzpq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32>
+// CHECK: store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32>
+// CHECK: store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x float>] [[TMP11]], [2 x <4 x float>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.float32x4x2_t [[TMP12]]
float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
  return vuzpq_f32(a, b);
}
-// CHECK-LABEL: test_vuzpq_p8
-// CHECK: vuzp.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vuzpq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
+// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
  return vuzpq_p8(a, b);
}
-// CHECK-LABEL: test_vuzpq_p16
-// CHECK: vuzp.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vuzpq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
+// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0
+// CHECK: store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK: [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK: ret %struct.poly16x8x2_t [[TMP12]]
poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
  return vuzpq_p16(a, b);
}
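+// The vzip checks below again differ from vtrn/vuzp only in the lane masks:
+// the expected ZIP1/ZIP2 masks interleave first the low and then the high
+// halves of the two inputs, e.g. for the 8-lane case
+// <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> and
+// <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>.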
-// CHECK-LABEL: test_vzip_s8
-// CHECK: vzip.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vzip_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
+// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int8x8x2_t [[TMP8]]
int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
  return vzip_s8(a, b);
}
-// CHECK-LABEL: test_vzip_s16
-// CHECK: vzip.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vzip_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
+// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0
+// CHECK: store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int16x4x2_t [[TMP12]]
int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
  return vzip_s16(a, b);
}
-// CHECK-LABEL: test_vzip_s32
-// CHECK: {{vtrn|vzip}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vzip_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
+// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
+// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
+// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0
+// CHECK: store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK: [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.int32x2x2_t [[TMP12]]
int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
  return vzip_s32(a, b);
}
-// CHECK-LABEL: test_vzip_u8
-// CHECK: vzip.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vzip_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4
+// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
+// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
  return vzip_u8(a, b);
}
-// CHECK-LABEL: test_vzip_u16
-// CHECK: vzip.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vzip_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 
x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint16x4x2_t [[TMP1]]2 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) { return vzip_u16(a, b); } -// CHECK-LABEL: test_vzip_u32 -// CHECK: {{vtrn|vzip}}.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.uint32x2x2_t @test_vzip_u32(<2 x i32> %a, <2 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x i32>] [[TMP1]]1, [2 x <2 x i32>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.uint32x2x2_t [[TMP1]]2 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) { return vzip_u32(a, b); } -// CHECK-LABEL: test_vzip_f32 -// CHECK: {{vtrn|vzip}}.32 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: 
define %struct.float32x2x2_t @test_vzip_f32(<2 x float> %a, <2 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK: store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0 +// CHECK: store [2 x <2 x float>] [[TMP1]]1, [2 x <2 x float>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.float32x2x2_t [[TMP1]]2 float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) { return vzip_f32(a, b); } -// CHECK-LABEL: test_vzip_p8 -// CHECK: vzip.8 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.poly8x8x2_t @test_vzip_p8(<8 x i8> %a, <8 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> +// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0 +// CHECK: store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly8x8x2_t [[TMP8]] 
poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) { return vzip_p8(a, b); } -// CHECK-LABEL: test_vzip_p16 -// CHECK: vzip.16 d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-LABEL: define %struct.poly16x4x2_t @test_vzip_p16(<4 x i16> %a, <4 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> +// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i16>] [[TMP1]]1, [2 x <4 x i16>]* [[TMP1]]0, align 8 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8 +// CHECK: ret %struct.poly16x4x2_t [[TMP1]]2 poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) { return vzip_p16(a, b); } -// CHECK-LABEL: test_vzipq_s8 -// CHECK: vzip.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.int8x16x2_t @test_vzipq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// 
CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int8x16x2_t [[TMP8]] int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) { return vzipq_s8(a, b); } -// CHECK-LABEL: test_vzipq_s16 -// CHECK: vzip.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.int16x8x2_t @test_vzipq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int16x8x2_t [[TMP1]]2 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) { return vzipq_s16(a, b); } -// CHECK-LABEL: test_vzipq_s32 -// CHECK: {{vtrn|vzip}}.32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.int32x4x2_t @test_vzipq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* 
[[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP1]]1, [2 x <4 x i32>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.int32x4x2_t [[TMP1]]2 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) { return vzipq_s32(a, b); } -// CHECK-LABEL: test_vzipq_u8 -// CHECK: vzip.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.uint8x16x2_t @test_vzipq_u8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint8x16x2_t [[TMP8]] uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) { return vzipq_u8(a, b); } -// CHECK-LABEL: test_vzipq_u16 -// CHECK: vzip.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.uint16x8x2_t @test_vzipq_u16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast 
%struct.uint16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint16x8x2_t [[TMP1]]2 uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) { return vzipq_u16(a, b); } -// CHECK-LABEL: test_vzipq_u32 -// CHECK: {{vtrn|vzip}}.32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.uint32x4x2_t @test_vzipq_u32(<4 x i32> %a, <4 x i32> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x i32>] [[TMP1]]1, [2 x <4 x i32>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.uint32x4x2_t [[TMP1]]2 uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) { return vzipq_u32(a, b); } -// CHECK-LABEL: test_vzipq_f32 -// CHECK: {{vtrn|vzip}}.32 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.float32x4x2_t @test_vzipq_f32(<4 x float> %a, <4 x float> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> 
+// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK: store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0 +// CHECK: store [2 x <4 x float>] [[TMP1]]1, [2 x <4 x float>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.float32x4x2_t [[TMP1]]2 float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) { return vzipq_f32(a, b); } -// CHECK-LABEL: test_vzipq_p8 -// CHECK: vzip.8 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.poly8x16x2_t @test_vzipq_p8(<16 x i8> %a, <16 x i8> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]] +// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> +// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]] +// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0 +// CHECK: store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16 +// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly8x16x2_t [[TMP8]] poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) { return vzipq_p8(a, b); } -// CHECK-LABEL: test_vzipq_p16 -// CHECK: vzip.16 q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-LABEL: define %struct.poly16x8x2_t @test_vzipq_p16(<8 x i16> %a, <8 x i16> %b) #0 { +// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 +// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: [[TMP1:%.*]] = bitcast <8 x 
i16> %a to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]] +// CHECK: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1 +// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> +// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]] +// CHECK: [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8* +// CHECK: [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #4 +// CHECK: [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16 +// CHECK: [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0 +// CHECK: [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0 +// CHECK: store [2 x <8 x i16>] [[TMP1]]1, [2 x <8 x i16>]* [[TMP1]]0, align 16 +// CHECK: [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16 +// CHECK: ret %struct.poly16x8x2_t [[TMP1]]2 poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) { return vzipq_p16(a, b); } Index: test/CodeGen/builtins-arm-exclusive.c =================================================================== --- test/CodeGen/builtins-arm-exclusive.c +++ test/CodeGen/builtins-arm-exclusive.c @@ -1,32 +1,6 @@ -// REQUIRES: arm-registered-target -// RUN: %clang_cc1 -Wall -Werror -triple thumbv8-linux-gnueabi -fno-signed-char -O3 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -Wall -Werror -triple arm64-apple-ios7.0 -O3 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARM64 - -// Make sure the canonical use works before going into smaller details: -int atomic_inc(int *addr) { - int Failure, OldVal; - do { - OldVal = __builtin_arm_ldrex(addr); - Failure = __builtin_arm_strex(OldVal + 1, addr); - } while (Failure); - - return OldVal; -} +// RUN: %clang_cc1 -Wall -Werror -triple thumbv8-linux-gnueabi -fno-signed-char -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -Wall -Werror -triple arm64-apple-ios7.0 -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s --check-prefix=CHECK-ARM64 -// CHECK-LABEL: @atomic_inc -// CHECK: [[OLDVAL:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* %addr) -// CHECK: [[INC:%.*]] = add nsw i32 [[OLDVAL]], 1 -// CHECK: [[FAILURE:%.*]] = tail call i32 @llvm.arm.strex.p0i32(i32 [[INC]], i32* %addr) -// CHECK: [[TST:%.*]] = icmp eq i32 [[FAILURE]], 0 -// CHECK: br i1 [[TST]], label {{%[a-zA-Z0-9.]+}}, label {{%[a-zA-Z0-9.]+}} - -// CHECK-ARM64-LABEL: @atomic_inc -// CHECK-ARM64: [[OLDVAL:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i32(i32* %addr) -// CHECK-ARM64: [[INC:%.*]] = add i64 [[OLDVAL]], 1 -// CHECK-ARM64: [[TRUNC:%.*]] = and i64 [[INC]], 4294967295 -// CHECK-ARM64: [[FAILURE:%.*]] = tail call i32 @llvm.aarch64.stxr.p0i32(i64 [[TRUNC]], i32* %addr) -// CHECK-ARM64: [[TST:%.*]] = icmp eq i32 [[FAILURE]], 0 -// CHECK-ARM64: br i1 [[TST]], label {{%[a-zA-Z0-9.]+}}, label {{%[a-zA-Z0-9.]+}} struct Simple { char a, b; @@ -37,36 +11,33 @@ // CHECK-ARM64-LABEL: @test_ldrex int sum = 0; sum += __builtin_arm_ldrex(addr); -// CHECK: 
[[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i8(i8* %addr) -// CHECK: and i32 [[INTRES]], 255 +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %addr) +// CHECK: trunc i32 [[INTRES]] to i8 -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr) -// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32 -// CHECK-ARM64: [[SEXTTMP:%.*]] = shl i32 [[TRUNCRES]], 24 -// CHECK-ARM64: ashr exact i32 [[SEXTTMP]], 24 +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr) +// CHECK-ARM64: trunc i64 [[INTRES]] to i8 sum += __builtin_arm_ldrex((short *)addr); // CHECK: [[ADDR16:%.*]] = bitcast i8* %addr to i16* -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i16(i16* [[ADDR16]]) -// CHECK: [[TMPSEXT:%.*]] = shl i32 [[INTRES]], 16 -// CHECK: ashr exact i32 [[TMPSEXT]], 16 +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* [[ADDR16]]) +// CHECK: trunc i32 [[INTRES]] to i16 // CHECK-ARM64: [[ADDR16:%.*]] = bitcast i8* %addr to i16* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i16(i16* [[ADDR16]]) -// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32 -// CHECK-ARM64: [[TMPSEXT:%.*]] = shl i32 [[TRUNCRES]], 16 -// CHECK-ARM64: ashr exact i32 [[TMPSEXT]], 16 +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i16(i16* [[ADDR16]]) +// CHECK-ARM64: trunc i64 [[INTRES]] to i16 sum += __builtin_arm_ldrex((int *)addr); // CHECK: [[ADDR32:%.*]] = bitcast i8* %addr to i32* -// CHECK: call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]]) +// CHECK: call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]]) // CHECK-ARM64: [[ADDR32:%.*]] = bitcast i8* %addr to i32* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i32(i32* [[ADDR32]]) +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* [[ADDR32]]) // CHECK-ARM64: trunc i64 [[INTRES]] to i32 sum += __builtin_arm_ldrex((long long *)addr); -// CHECK: call { i32, i32 } @llvm.arm.ldrexd(i8* %addr) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64* +// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8* +// CHECK: call { i32, i32 } @llvm.arm.ldrexd(i8* [[TMP5]]) // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64* // CHECK-ARM64: call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]]) @@ -79,16 +50,18 @@ sum += __builtin_arm_ldrex(addrfloat); // CHECK: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32* -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[INTADDR]]) +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* [[INTADDR]]) // CHECK: bitcast i32 [[INTRES]] to float // CHECK-ARM64: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i32(i32* [[INTADDR]]) +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* [[INTADDR]]) // CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32 // CHECK-ARM64: bitcast i32 [[TRUNCRES]] to float sum += __builtin_arm_ldrex((double *)addr); -// CHECK: [[STRUCTRES:%.*]] = tail call { i32, i32 } @llvm.arm.ldrexd(i8* %addr) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8* +// CHECK: [[STRUCTRES:%.*]] = call { i32, i32 } @llvm.arm.ldrexd(i8* [[TMP5]]) // CHECK: [[RESHI:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 1 // CHECK: [[RESLO:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 0 // CHECK: [[RESHI64:%.*]] = zext i32 [[RESHI]] to i64 @@ -97,21 +70,31 @@ // CHECK: [[INTRES:%.*]] = or i64 
[[RESHIHI]], [[RESLO64]] // CHECK: bitcast i64 [[INTRES]] to double -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64* +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i64(i64* [[TMP5]]) // CHECK-ARM64: bitcast i64 [[INTRES]] to double sum += *__builtin_arm_ldrex((int **)addr); -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i32** +// CHECK: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i32* +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* [[TMP5]]) // CHECK: inttoptr i32 [[INTRES]] to i32* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to i32** +// CHECK-ARM64: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i64* +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i64(i64* [[TMP5]]) // CHECK-ARM64: inttoptr i64 [[INTRES]] to i32* sum += __builtin_arm_ldrex((struct Simple **)addr)->a; -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32* +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* [[TMP5]]) // CHECK: inttoptr i32 [[INTRES]] to %struct.Simple* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64* +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i64(i64* [[TMP5]]) // CHECK-ARM64: inttoptr i64 [[INTRES]] to %struct.Simple* return sum; } @@ -121,36 +104,33 @@ // CHECK-ARM64-LABEL: @test_ldaex int sum = 0; sum += __builtin_arm_ldaex(addr); -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i8(i8* %addr) -// CHECK: and i32 [[INTRES]], 255 +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i8(i8* %addr) +// CHECK: trunc i32 [[INTRES]] to i8 -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr) -// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32 -// CHECK-ARM64: [[SEXTTMP:%.*]] = shl i32 [[TRUNCRES]], 24 -// CHECK-ARM64: ashr exact i32 [[SEXTTMP]], 24 +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr) +// CHECK-ARM64: trunc i64 [[INTRES]] to i8 sum += __builtin_arm_ldaex((short *)addr); // CHECK: [[ADDR16:%.*]] = bitcast i8* %addr to i16* -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i16(i16* [[ADDR16]]) -// CHECK: [[TMPSEXT:%.*]] = shl i32 [[INTRES]], 16 -// CHECK: ashr exact i32 [[TMPSEXT]], 16 +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i16(i16* [[ADDR16]]) +// CHECK: trunc i32 [[INTRES]] to i16 // CHECK-ARM64: [[ADDR16:%.*]] = bitcast i8* %addr to i16* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i16(i16* [[ADDR16]]) -// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32 -// CHECK-ARM64: [[TMPSEXT:%.*]] = shl i32 [[TRUNCRES]], 16 -// CHECK-ARM64: ashr exact i32 [[TMPSEXT]], 16 +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i16(i16* [[ADDR16]]) +// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i16 sum += __builtin_arm_ldaex((int *)addr); // CHECK: [[ADDR32:%.*]] = bitcast i8* %addr to i32* // CHECK: 
call i32 @llvm.arm.ldaex.p0i32(i32* [[ADDR32]]) // CHECK-ARM64: [[ADDR32:%.*]] = bitcast i8* %addr to i32* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[ADDR32]]) +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[ADDR32]]) // CHECK-ARM64: trunc i64 [[INTRES]] to i32 sum += __builtin_arm_ldaex((long long *)addr); -// CHECK: call { i32, i32 } @llvm.arm.ldaexd(i8* %addr) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64* +// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8* +// CHECK: call { i32, i32 } @llvm.arm.ldaexd(i8* [[TMP5]]) // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64* // CHECK-ARM64: call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]]) @@ -163,16 +143,18 @@ sum += __builtin_arm_ldaex(addrfloat); // CHECK: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32* -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i32(i32* [[INTADDR]]) +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* [[INTADDR]]) // CHECK: bitcast i32 [[INTRES]] to float // CHECK-ARM64: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[INTADDR]]) +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[INTADDR]]) // CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32 // CHECK-ARM64: bitcast i32 [[TRUNCRES]] to float sum += __builtin_arm_ldaex((double *)addr); -// CHECK: [[STRUCTRES:%.*]] = tail call { i32, i32 } @llvm.arm.ldaexd(i8* %addr) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8* +// CHECK: [[STRUCTRES:%.*]] = call { i32, i32 } @llvm.arm.ldaexd(i8* [[TMP5]]) // CHECK: [[RESHI:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 1 // CHECK: [[RESLO:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 0 // CHECK: [[RESHI64:%.*]] = zext i32 [[RESHI]] to i64 @@ -181,21 +163,31 @@ // CHECK: [[INTRES:%.*]] = or i64 [[RESHIHI]], [[RESLO64]] // CHECK: bitcast i64 [[INTRES]] to double -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64* +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP5]]) // CHECK-ARM64: bitcast i64 [[INTRES]] to double sum += *__builtin_arm_ldaex((int **)addr); -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i32(i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i32** +// CHECK: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i32* +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* [[TMP5]]) // CHECK: inttoptr i32 [[INTRES]] to i32* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to i32** +// CHECK-ARM64: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i64* +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP5]]) // CHECK-ARM64: inttoptr i64 [[INTRES]] to i32* sum += __builtin_arm_ldaex((struct Simple **)addr)->a; -// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i32(i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32* +// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* [[TMP5]]) // CHECK: inttoptr i32 [[INTRES]] to %struct.Simple* -// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 
@llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64* +// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP5]]) // CHECK-ARM64: inttoptr i64 [[INTRES]] to %struct.Simple* return sum; } @@ -225,27 +217,51 @@ // CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i32(i64 42, i32* [[ADDR32]]) res |= __builtin_arm_strex(42, (long long *)addr); -// CHECK: call i32 @llvm.arm.strexd(i32 42, i32 0, i8* %addr) +// CHECK: store i64 42, i64* [[TMP:%.*]], align 8 +// CHECK: [[LOHI_ADDR:%.*]] = bitcast i64* [[TMP]] to { i32, i32 }* +// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]] +// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0 +// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64* +// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8* +// CHECK: call i32 @llvm.arm.strexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]]) // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64* // CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 42, i64* [[ADDR64]]) res |= __builtin_arm_strex(2.71828f, (float *)addr); -// CHECK: call i32 @llvm.arm.strex.p0i32(i32 1076754509, i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to float* +// CHECK: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32* +// CHECK: call i32 @llvm.arm.strex.p0i32(i32 1076754509, i32* [[TMP5]]) -// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i32(i64 1076754509, i32* [[ADDR32]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to float* +// CHECK-ARM64: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32* +// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i32(i64 1076754509, i32* [[TMP5]]) res |= __builtin_arm_strex(3.14159, (double *)addr); -// CHECK: call i32 @llvm.arm.strexd(i32 -266631570, i32 1074340345, i8* %addr) - -// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 4614256650576692846, i64* [[ADDR64]]) +// CHECK: store double 3.141590e+00, double* [[TMP:%.*]], align 8 +// CHECK: [[LOHI_ADDR:%.*]] = bitcast double* [[TMP]] to { i32, i32 }* +// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]] +// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0 +// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8* +// CHECK: call i32 @llvm.arm.strexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]]) + +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64* +// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 4614256650576692846, i64* [[TMP5]]) res |= __builtin_arm_strex(&var, (struct Simple **)addr); -// CHECK: [[INTVAL:%.*]] = ptrtoint i16* %var to i32 -// CHECK: call i32 @llvm.arm.strex.p0i32(i32 [[INTVAL]], i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32* +// CHECK: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i32 +// CHECK: call i32 @llvm.arm.strex.p0i32(i32 [[INTVAL]], i32* [[TMP5]]) -// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint i16* %var to i64 -// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 [[INTVAL]], i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64* +// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint 
%struct.Simple* %var to i64 +// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 [[INTVAL]], i64* [[TMP5]]) return res; } @@ -275,27 +291,51 @@ // CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i32(i64 42, i32* [[ADDR32]]) res |= __builtin_arm_stlex(42, (long long *)addr); -// CHECK: call i32 @llvm.arm.stlexd(i32 42, i32 0, i8* %addr) +// CHECK: store i64 42, i64* [[TMP:%.*]], align 8 +// CHECK: [[LOHI_ADDR:%.*]] = bitcast i64* [[TMP]] to { i32, i32 }* +// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]] +// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0 +// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64* +// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8* +// CHECK: call i32 @llvm.arm.stlexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]]) // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64* // CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 42, i64* [[ADDR64]]) res |= __builtin_arm_stlex(2.71828f, (float *)addr); -// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 1076754509, i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to float* +// CHECK: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32* +// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 1076754509, i32* [[TMP5]]) -// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i32(i64 1076754509, i32* [[ADDR32]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to float* +// CHECK-ARM64: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32* +// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i32(i64 1076754509, i32* [[TMP5]]) res |= __builtin_arm_stlex(3.14159, (double *)addr); -// CHECK: call i32 @llvm.arm.stlexd(i32 -266631570, i32 1074340345, i8* %addr) - -// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 4614256650576692846, i64* [[ADDR64]]) +// CHECK: store double 3.141590e+00, double* [[TMP:%.*]], align 8 +// CHECK: [[LOHI_ADDR:%.*]] = bitcast double* [[TMP]] to { i32, i32 }* +// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]] +// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0 +// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1 +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8* +// CHECK: call i32 @llvm.arm.stlexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]]) + +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double* +// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64* +// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 4614256650576692846, i64* [[TMP5]]) res |= __builtin_arm_stlex(&var, (struct Simple **)addr); -// CHECK: [[INTVAL:%.*]] = ptrtoint i16* %var to i32 -// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 [[INTVAL]], i32* [[ADDR32]]) +// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32* +// CHECK: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i32 +// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 [[INTVAL]], i32* [[TMP5]]) -// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint i16* %var to i64 -// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 [[INTVAL]], i64* [[ADDR64]]) +// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple** +// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64* +// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i64 +// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 [[INTVAL]], i64* [[TMP5]]) return res; } @@ -317,7 +357,7 @@ return __builtin_arm_ldrex(addr); // CHECK-ARM64: [[ADDR8:%.*]] = 
bitcast i128* %addr to i8* -// CHECK-ARM64: [[STRUCTRES:%.*]] = tail call { i64, i64 } @llvm.aarch64.ldxp(i8* [[ADDR8]]) +// CHECK-ARM64: [[STRUCTRES:%.*]] = call { i64, i64 } @llvm.aarch64.ldxp(i8* [[ADDR8]]) // CHECK-ARM64: [[RESHI:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 1 // CHECK-ARM64: [[RESLO:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 0 // CHECK-ARM64: [[RESHI64:%.*]] = zext i64 [[RESHI]] to i128 @@ -331,11 +371,13 @@ // CHECK-ARM64-LABEL: @test_strex_128 return __builtin_arm_strex(val, addr); -// CHECK-ARM64: [[VALLO:%.*]] = trunc i128 %val to i64 -// CHECK-ARM64: [[VALHI128:%.*]] = lshr i128 %val, 64 -// CHECK-ARM64: [[VALHI:%.*]] = trunc i128 [[VALHI128]] to i64 +// CHECK-ARM64: store i128 %val, i128* [[TMP:%.*]], align 16 +// CHECK-ARM64: [[LOHI_ADDR:%.*]] = bitcast i128* [[TMP]] to { i64, i64 }* +// CHECK-ARM64: [[LOHI:%.*]] = load { i64, i64 }, { i64, i64 }* [[LOHI_ADDR]] +// CHECK-ARM64: [[LO:%.*]] = extractvalue { i64, i64 } [[LOHI]], 0 +// CHECK-ARM64: [[HI:%.*]] = extractvalue { i64, i64 } [[LOHI]], 1 // CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8* -// CHECK-ARM64: [[RES:%.*]] = tail call i32 @llvm.aarch64.stxp(i64 [[VALLO]], i64 [[VALHI]], i8* [[ADDR8]]) +// CHECK-ARM64: call i32 @llvm.aarch64.stxp(i64 [[LO]], i64 [[HI]], i8* [[ADDR8]]) } __int128 test_ldaex_128(__int128 *addr) { @@ -343,7 +385,7 @@ return __builtin_arm_ldaex(addr); // CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8* -// CHECK-ARM64: [[STRUCTRES:%.*]] = tail call { i64, i64 } @llvm.aarch64.ldaxp(i8* [[ADDR8]]) +// CHECK-ARM64: [[STRUCTRES:%.*]] = call { i64, i64 } @llvm.aarch64.ldaxp(i8* [[ADDR8]]) // CHECK-ARM64: [[RESHI:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 1 // CHECK-ARM64: [[RESLO:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 0 // CHECK-ARM64: [[RESHI64:%.*]] = zext i64 [[RESHI]] to i128 @@ -357,11 +399,13 @@ // CHECK-ARM64-LABEL: @test_stlex_128 return __builtin_arm_stlex(val, addr); -// CHECK-ARM64: [[VALLO:%.*]] = trunc i128 %val to i64 -// CHECK-ARM64: [[VALHI128:%.*]] = lshr i128 %val, 64 -// CHECK-ARM64: [[VALHI:%.*]] = trunc i128 [[VALHI128]] to i64 +// CHECK-ARM64: store i128 %val, i128* [[TMP:%.*]], align 16 +// CHECK-ARM64: [[LOHI_ADDR:%.*]] = bitcast i128* [[TMP]] to { i64, i64 }* +// CHECK-ARM64: [[LOHI:%.*]] = load { i64, i64 }, { i64, i64 }* [[LOHI_ADDR]] +// CHECK-ARM64: [[LO:%.*]] = extractvalue { i64, i64 } [[LOHI]], 0 +// CHECK-ARM64: [[HI:%.*]] = extractvalue { i64, i64 } [[LOHI]], 1 // CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8* -// CHECK-ARM64: [[RES:%.*]] = tail call i32 @llvm.aarch64.stlxp(i64 [[VALLO]], i64 [[VALHI]], i8* [[ADDR8]]) +// CHECK-ARM64: [[RES:%.*]] = call i32 @llvm.aarch64.stlxp(i64 [[LO]], i64 [[HI]], i8* [[ADDR8]]) } #endif Index: test/CodeGen/builtins-arm.c =================================================================== --- test/CodeGen/builtins-arm.c +++ test/CodeGen/builtins-arm.c @@ -1,5 +1,4 @@ -// REQUIRES: arm-registered-target -// RUN: %clang_cc1 -Wall -Werror -triple thumbv7-eabi -target-cpu cortex-a8 -O3 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wall -Werror -triple thumbv7-eabi -target-cpu cortex-a8 -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s void *f0() { @@ -87,14 +86,14 @@ unsigned mrc() { // CHECK: define i32 @mrc() - // CHECK: [[R:%.*]] = {{.*}} call i32 @llvm.arm.mrc(i32 15, i32 0, i32 13, i32 0, i32 3) + // CHECK: [[R:%.*]] = call i32 @llvm.arm.mrc(i32 15, i32 0, i32 13, i32 0, i32 3) // CHECK-NEXT: ret i32 [[R]] return __builtin_arm_mrc(15, 0, 13, 0, 3); } 
unsigned mrc2() { // CHECK: define i32 @mrc2() - // CHECK: [[R:%.*]] = {{.*}} call i32 @llvm.arm.mrc2(i32 15, i32 0, i32 13, i32 0, i32 3) + // CHECK: [[R:%.*]] = call i32 @llvm.arm.mrc2(i32 15, i32 0, i32 13, i32 0, i32 3) // CHECK-NEXT: ret i32 [[R]] return __builtin_arm_mrc2(15, 0, 13, 0, 3); } @@ -124,40 +123,40 @@ } unsigned rsr() { - // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i32 @llvm.read_register.i32(metadata !7) + // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i32 @llvm.read_register.i32(metadata ![[M0:.*]]) // CHECK-NEXT: ret i32 [[V0]] return __builtin_arm_rsr("cp1:2:c3:c4:5"); } unsigned long long rsr64() { - // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i64 @llvm.read_register.i64(metadata !8) + // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i64 @llvm.read_register.i64(metadata ![[M1:.*]]) // CHECK-NEXT: ret i64 [[V0]] return __builtin_arm_rsr64("cp1:2:c3"); } void *rsrp() { - // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i32 @llvm.read_register.i32(metadata !9) + // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i32 @llvm.read_register.i32(metadata ![[M2:.*]]) // CHECK-NEXT: [[V1:[%A-Za-z0-9.]+]] = inttoptr i32 [[V0]] to i8* // CHECK-NEXT: ret i8* [[V1]] return __builtin_arm_rsrp("sysreg"); } void wsr(unsigned v) { - // CHECK: call void @llvm.write_register.i32(metadata !7, i32 %v) + // CHECK: call void @llvm.write_register.i32(metadata ![[M0]], i32 %v) __builtin_arm_wsr("cp1:2:c3:c4:5", v); } void wsr64(unsigned long long v) { - // CHECK: call void @llvm.write_register.i64(metadata !8, i64 %v) + // CHECK: call void @llvm.write_register.i64(metadata ![[M1]], i64 %v) __builtin_arm_wsr64("cp1:2:c3", v); } void wsrp(void *v) { // CHECK: [[V0:[%A-Za-z0-9.]+]] = ptrtoint i8* %v to i32 - // CHECK-NEXT: call void @llvm.write_register.i32(metadata !9, i32 [[V0]]) + // CHECK-NEXT: call void @llvm.write_register.i32(metadata ![[M2]], i32 [[V0]]) __builtin_arm_wsrp("sysreg", v); } -// CHECK: !7 = !{!"cp1:2:c3:c4:5"} -// CHECK: !8 = !{!"cp1:2:c3"} -// CHECK: !9 = !{!"sysreg"} +// CHECK: ![[M0]] = !{!"cp1:2:c3:c4:5"} +// CHECK: ![[M1]] = !{!"cp1:2:c3"} +// CHECK: ![[M2]] = !{!"sysreg"} Index: test/CodeGen/builtins-arm64.c =================================================================== --- test/CodeGen/builtins-arm64.c +++ test/CodeGen/builtins-arm64.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple arm64-apple-ios -O3 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple arm64-apple-ios -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s void f0(void *a, void *b) { __clear_cache(a,b); @@ -50,7 +50,7 @@ } unsigned rsr() { - // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]]) + // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]]) // CHECK-NEXT: trunc i64 [[V0]] to i32 return __builtin_arm_rsr("1:2:3:4:5"); } @@ -61,7 +61,7 @@ } void *rsrp() { - // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]]) + // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]]) // CHECK-NEXT: inttoptr i64 [[V0]] to i8* return __builtin_arm_rsrp("1:2:3:4:5"); } Index: test/CodeGen/fp128_complex.c =================================================================== --- test/CodeGen/fp128_complex.c +++ test/CodeGen/fp128_complex.c @@ -1,9 +1,9 @@ -// RUN: %clang -target aarch64-linux-gnuabi %s -O3 -S -emit-llvm -o - | FileCheck %s +// RUN: %clang -target aarch64-linux-gnuabi %s -S -emit-llvm -o - | FileCheck %s _Complex long double a, b, c, d; void 
test_fp128_compound_assign(void) { - // CHECK: tail call { fp128, fp128 } @__multc3 + // CHECK: call { fp128, fp128 } @__multc3 a *= b; - // CHECK: tail call { fp128, fp128 } @__divtc3 + // CHECK: call { fp128, fp128 } @__divtc3 c /= d; } Index: test/CodeGen/neon-immediate-ubsan.c =================================================================== --- test/CodeGen/neon-immediate-ubsan.c +++ test/CodeGen/neon-immediate-ubsan.c @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -triple armv7s-linux-gnu -emit-llvm -O1 -o - %s \ +// RUN: %clang_cc1 -triple armv7s-linux-gnu -emit-llvm -o - %s \ // RUN: -target-feature +neon -target-cpu cortex-a8 \ // RUN: -fsanitize=signed-integer-overflow \ // RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ARMV7 -// RUN: %clang_cc1 -triple aarch64-unknown-unknown -emit-llvm -O1 -o - %s \ +// RUN: %clang_cc1 -triple aarch64-unknown-unknown -emit-llvm -o - %s \ // RUN: -target-feature +neon -target-cpu cortex-a53 \ // RUN: -fsanitize=signed-integer-overflow \ // RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=AARCH64
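A closing note on the testing idiom this patch adopts (an illustrative sketch, not part of the diff): every RUN line above drops -O3 and instead pipes clang's unoptimized IR through opt -S -mem2reg, so the CHECK lines match front-end output plus alloca promotion rather than full optimizer output. The minimal test below shows the shape of the idiom; the function name is hypothetical, and exact value names and attribute groups (e.g. #0) vary by clang revision:

// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

#include <arm_neon.h>

// vadd_u8 is an always_inline wrapper around a plain vector add, so after
// mem2reg the body collapses to a single `add` followed by `ret`, and the
// checks can bind the result with a FileCheck pattern variable.
// CHECK-LABEL: @test_vadd_u8(
// CHECK: [[SUM:%.*]] = add <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[SUM]]
uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) {
  return vadd_u8(a, b);
}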