diff --git a/clang/test/CodeGen/arm-cde-vec.c b/clang/test/CodeGen/arm-cde-vec.c --- a/clang/test/CodeGen/arm-cde-vec.c +++ b/clang/test/CodeGen/arm-cde-vec.c @@ -155,8 +155,8 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[N:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v4i1(i32 1, <2 x i64> [[INACTIVE:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[M:%.*]], i32 11, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v2i1(i32 1, <2 x i64> [[INACTIVE:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[M:%.*]], i32 11, <2 x i1> [[TMP2]]) // CHECK-NEXT: ret <2 x i64> [[TMP3]] // int64x2_t test_vcx3q_m(int64x2_t inactive, float32x4_t n, int8x16_t m, mve_pred16_t p) { diff --git a/clang/test/CodeGen/arm-mve-intrinsics/predicates.c b/clang/test/CodeGen/arm-mve-intrinsics/predicates.c --- a/clang/test/CodeGen/arm-mve-intrinsics/predicates.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/predicates.c @@ -62,8 +62,8 @@ // CHECK-LABEL: @test_vctp64q( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 // CHECK-NEXT: ret i16 [[TMP2]] // @@ -75,10 +75,10 @@ // CHECK-LABEL: @test_vctp64q_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call 
<4 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]]) -// CHECK-NEXT: [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]] -// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = and <2 x i1> [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 // CHECK-NEXT: ret i16 [[TMP5]] // diff --git a/clang/test/CodeGen/arm-mve-intrinsics/scatter-gather.c b/clang/test/CodeGen/arm-mve-intrinsics/scatter-gather.c --- a/clang/test/CodeGen/arm-mve-intrinsics/scatter-gather.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/scatter-gather.c @@ -238,8 +238,8 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> [[TMP0]], i32 664, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> [[TMP0]], i32 664, <2 x i1> [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 1 // CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[ADDR]], align 8 // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 0 @@ -254,8 +254,8 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> 
@llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> [[TMP0]], i32 656, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> [[TMP0]], i32 656, <2 x i1> [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 1 // CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[ADDR]], align 8 // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 0 @@ -269,8 +269,8 @@ // CHECK-LABEL: @test_vldrdq_gather_base_z_s64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64> [[ADDR:%.*]], i32 888, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v2i1(<2 x i64> [[ADDR:%.*]], i32 888, <2 x i1> [[TMP1]]) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // int64x2_t test_vldrdq_gather_base_z_s64(uint64x2_t addr, mve_pred16_t p) @@ -281,8 +281,8 @@ // CHECK-LABEL: @test_vldrdq_gather_base_z_u64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64> [[ADDR:%.*]], i32 -1000, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> 
@llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v2i1(<2 x i64> [[ADDR:%.*]], i32 -1000, <2 x i1> [[TMP1]]) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // uint64x2_t test_vldrdq_gather_base_z_u64(uint64x2_t addr, mve_pred16_t p) @@ -321,8 +321,8 @@ // CHECK-LABEL: @test_vldrdq_gather_offset_z_s64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 0, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v2i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 0, i32 0, <2 x i1> [[TMP1]]) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // int64x2_t test_vldrdq_gather_offset_z_s64(const int64_t *base, uint64x2_t offset, mve_pred16_t p) @@ -337,8 +337,8 @@ // CHECK-LABEL: @test_vldrdq_gather_offset_z_u64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 0, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v2i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 0, i32 1, <2 x i1> [[TMP1]]) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // uint64x2_t test_vldrdq_gather_offset_z_u64(const uint64_t *base, uint64x2_t offset, mve_pred16_t p) @@ -381,8 +381,8 @@ // CHECK-LABEL: 
@test_vldrdq_gather_shifted_offset_z_s64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 3, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v2i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 3, i32 0, <2 x i1> [[TMP1]]) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // int64x2_t test_vldrdq_gather_shifted_offset_z_s64(const int64_t *base, uint64x2_t offset, mve_pred16_t p) @@ -397,8 +397,8 @@ // CHECK-LABEL: @test_vldrdq_gather_shifted_offset_z_u64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 3, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v2i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 3, i32 1, <2 x i1> [[TMP1]]) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // uint64x2_t test_vldrdq_gather_shifted_offset_z_u64(const uint64_t *base, uint64x2_t offset, mve_pred16_t p) @@ -1229,8 +1229,8 @@ // CHECK-LABEL: @test_vstrdq_scatter_base_p_s64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: call void 
@llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> [[ADDR:%.*]], i32 888, <2 x i64> [[VALUE:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v2i1(<2 x i64> [[ADDR:%.*]], i32 888, <2 x i64> [[VALUE:%.*]], <2 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vstrdq_scatter_base_p_s64(uint64x2_t addr, int64x2_t value, mve_pred16_t p) @@ -1245,8 +1245,8 @@ // CHECK-LABEL: @test_vstrdq_scatter_base_p_u64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> [[ADDR:%.*]], i32 264, <2 x i64> [[VALUE:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v2i1(<2 x i64> [[ADDR:%.*]], i32 264, <2 x i64> [[VALUE:%.*]], <2 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vstrdq_scatter_base_p_u64(uint64x2_t addr, uint64x2_t value, mve_pred16_t p) @@ -1290,8 +1290,8 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> [[TMP0]], i32 248, <2 x i64> [[VALUE:%.*]], <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> [[TMP0]], i32 248, <2 x i64> [[VALUE:%.*]], <2 x i1> [[TMP2]]) // CHECK-NEXT: 
store <2 x i64> [[TMP3]], <2 x i64>* [[ADDR]], align 8 // CHECK-NEXT: ret void // @@ -1308,8 +1308,8 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> [[TMP0]], i32 136, <2 x i64> [[VALUE:%.*]], <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> [[TMP0]], i32 136, <2 x i64> [[VALUE:%.*]], <2 x i1> [[TMP2]]) // CHECK-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* [[ADDR]], align 8 // CHECK-NEXT: ret void // @@ -1357,8 +1357,8 @@ // CHECK-LABEL: @test_vstrdq_scatter_offset_p_s64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v2i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 0, <2 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vstrdq_scatter_offset_p_s64(int64_t *base, uint64x2_t offset, int64x2_t value, mve_pred16_t p) @@ -1373,8 +1373,8 @@ // CHECK-LABEL: @test_vstrdq_scatter_offset_p_u64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// 
CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v2i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 0, <2 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vstrdq_scatter_offset_p_u64(uint64_t *base, uint64x2_t offset, uint64x2_t value, mve_pred16_t p) @@ -1417,8 +1417,8 @@ // CHECK-LABEL: @test_vstrdq_scatter_shifted_offset_p_s64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 3, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v2i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 3, <2 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vstrdq_scatter_shifted_offset_p_s64(int64_t *base, uint64x2_t offset, int64x2_t value, mve_pred16_t p) @@ -1433,8 +1433,8 @@ // CHECK-LABEL: @test_vstrdq_scatter_shifted_offset_p_u64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 3, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> 
@llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v2i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 3, <2 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vstrdq_scatter_shifted_offset_p_u64(uint64_t *base, uint64x2_t offset, uint64x2_t value, mve_pred16_t p) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vld24.c b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vld24.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c @@ -12,8 +12,8 @@ // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <8 x half>, <8 x half> } [[TMP0]], 0 // CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T:%.*]] undef, <8 x half> [[TMP1]], 0, 0 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <8 x half>, <8 x half> } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] %2, <8 x half> [[TMP3]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] %4 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP2]], <8 x half> [[TMP3]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP4]] // float16x8x2_t test_vld2q_f16(const float16_t *addr) { @@ -30,12 +30,12 @@ // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP0]], 0 // CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T:%.*]] undef, <16 x i8> [[TMP1]], 0, 0 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP0]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] %2, <16 x i8> [[TMP3]], 0, 1 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] [[TMP2]], <16 x i8> [[TMP3]], 0, 1 // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP0]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] %4, <16 x i8> [[TMP5]], 0, 2 +// CHECK-NEXT: [[TMP6:%.*]] = 
insertvalue [[STRUCT_UINT8X16X4_T]] [[TMP4]], <16 x i8> [[TMP5]], 0, 2 // CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP0]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] %6, <16 x i8> [[TMP7]], 0, 3 -// CHECK-NEXT: ret [[STRUCT_UINT8X16X4_T]] %8 +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] [[TMP6]], <16 x i8> [[TMP7]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_UINT8X16X4_T]] [[TMP8]] // uint8x16x4_t test_vld4q_u8(const uint8_t *addr) { @@ -48,8 +48,8 @@ // CHECK-LABEL: @test_vst2q_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T:%.*]] %value.coerce, 0, 0 -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T]] %value.coerce, 0, 1 +// CHECK-NEXT: [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T:%.*]] [[VALUE_COERCE:%.*]], 0, 0 +// CHECK-NEXT: [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T]] [[VALUE_COERCE]], 0, 1 // CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[ADDR:%.*]], <4 x i32> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <4 x i32> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 0) // CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[ADDR]], <4 x i32> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <4 x i32> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 1) // CHECK-NEXT: ret void @@ -65,10 +65,10 @@ // CHECK-LABEL: @test_vst4q_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T:%.*]] %value.coerce, 0, 0 -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] %value.coerce, 0, 1 -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] %value.coerce, 0, 2 -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] %value.coerce, 0, 3 +// CHECK-NEXT: [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] 
= extractvalue [[STRUCT_INT8X16X4_T:%.*]] [[VALUE_COERCE:%.*]], 0, 0 +// CHECK-NEXT: [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] [[VALUE_COERCE]], 0, 1 +// CHECK-NEXT: [[VALUE_COERCE_FCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] [[VALUE_COERCE]], 0, 2 +// CHECK-NEXT: [[VALUE_COERCE_FCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] [[VALUE_COERCE]], 0, 3 // CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0i8.v16i8(i8* [[ADDR:%.*]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 0) // CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0i8.v16i8(i8* [[ADDR]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 1) // CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0i8.v16i8(i8* [[ADDR]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 2) @@ -86,8 +86,8 @@ // CHECK-LABEL: @test_vst2q_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T:%.*]] %value.coerce, 0, 0 -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] %value.coerce, 0, 1 +// CHECK-NEXT: [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T:%.*]] [[VALUE_COERCE:%.*]], 0, 0 +// CHECK-NEXT: [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[VALUE_COERCE]], 0, 1 // CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0f16.v8f16(half* [[ADDR:%.*]], <8 x half> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <8 x half> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 0) // CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0f16.v8f16(half* [[ADDR]], <8 x half> 
[[VALUE_COERCE_FCA_0_0_EXTRACT]], <8 x half> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 1) // CHECK-NEXT: ret void diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vldr.c b/clang/test/CodeGen/arm-mve-intrinsics/vldr.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vldr.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vldr.c @@ -37,8 +37,8 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> [[TMP0]], i32 656, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> [[TMP0]], i32 656, <2 x i1> [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 1 // CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[ADDR]], align 8 // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 0 diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c @@ -97,8 +97,8 @@ // CHECK-LABEL: @test_vmullbq_int_m_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 0, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = 
call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v2i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 0, <2 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // int64x2_t test_vmullbq_int_m_s32(int64x2_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) @@ -161,8 +161,8 @@ // CHECK-LABEL: @test_vmullbq_int_x_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 0, <4 x i1> [[TMP1]], <2 x i64> undef) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v2i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 0, <2 x i1> [[TMP1]], <2 x i64> undef) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // uint64x2_t test_vmullbq_int_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c @@ -97,8 +97,8 @@ // CHECK-LABEL: @test_vmulltq_int_m_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 1, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v2i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 1, 
<2 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // int64x2_t test_vmulltq_int_m_s32(int64x2_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) @@ -161,8 +161,8 @@ // CHECK-LABEL: @test_vmulltq_int_x_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 1, <4 x i1> [[TMP1]], <2 x i64> undef) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v2i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 1, <2 x i1> [[TMP1]], <2 x i64> undef) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // uint64x2_t test_vmulltq_int_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqdmullbq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqdmullbq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vqdmullbq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vqdmullbq.c @@ -50,8 +50,8 @@ // CHECK-LABEL: @test_vqdmullbq_m_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v2i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <2 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // int64x2_t test_vqdmullbq_m_s32(int64x2_t 
inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) { @@ -114,8 +114,8 @@ // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v2i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 0, <2 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // int64x2_t test_vqdmullbq_m_n_s32(int64x2_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) { diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqdmulltq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqdmulltq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vqdmulltq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vqdmulltq.c @@ -50,8 +50,8 @@ // CHECK-LABEL: @test_vqdmulltq_m_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v2i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <2 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <2 x i64> 
[[TMP2]] // int64x2_t test_vqdmulltq_m_s32(int64x2_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) { @@ -114,8 +114,8 @@ // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v2i1(<4 x i32> [[A:%.*]], <4 x i32> [[DOTSPLAT]], i32 1, <2 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]]) // CHECK-NEXT: ret <2 x i64> [[TMP2]] // int64x2_t test_vqdmulltq_m_n_s32(int64x2_t inactive, int32x4_t a, int32_t b, mve_pred16_t p) { diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -349,13 +349,8 @@ bool requiresFloat() const override { return false; }; bool requiresMVE() const override { return true; } std::string llvmName() const override { - // Use <4 x i1> instead of <2 x i1> for two-lane vector types. See - // the comment in llvm/lib/Target/ARM/ARMInstrMVE.td for further - // explanation. - unsigned ModifiedLanes = (Lanes == 2 ? 
4 : Lanes); - - return "llvm::FixedVectorType::get(Builder.getInt1Ty(), " + - utostr(ModifiedLanes) + ")"; + return "llvm::FixedVectorType::get(Builder.getInt1Ty(), " + utostr(Lanes) + + ")"; } static bool classof(const Type *T) { diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -809,8 +809,7 @@ def int_arm_mve_vctp8 : Intrinsic<[llvm_v16i1_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_mve_vctp16 : Intrinsic<[llvm_v8i1_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_mve_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>; -// vctp64 takes v4i1, to work around v2i1 not being a legal MVE type -def int_arm_mve_vctp64 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>; +def int_arm_mve_vctp64 : Intrinsic<[llvm_v2i1_ty], [llvm_i32_ty], [IntrNoMem]>; // v8.3-A Floating-point complex add def int_arm_neon_vcadd_rot90 : Neon_2Arg_Intrinsic; diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -2960,10 +2960,6 @@ if (auto *Op = dyn_cast(Operands[0])) { unsigned Lanes = FVTy->getNumElements(); uint64_t Limit = Op->getZExtValue(); - // vctp64 are currently modelled as returning a v4i1, not a v2i1. Make - // sure we get the limit right in that case and set all relevant lanes. - if (IntrinsicID == Intrinsic::arm_mve_vctp64) - Limit *= 2; SmallVector NCs; for (unsigned i = 0; i < Lanes; i++) { diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -702,6 +702,31 @@ NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys); return true; } + + if (Name == "arm.mve.vctp64" && + cast(F->getReturnType())->getNumElements() == 4) { + // A vctp64 returning a v4i1 is converted to return a v2i1. 
Rename the + // function and deal with it below in UpgradeIntrinsicCall. + rename(F); + return true; + } + // These too are changed to accept a v2i1 instead of the old v4i1. + if (Name == "arm.mve.mull.int.predicated.v2i64.v4i32.v4i1" || + Name == "arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1" || + Name == "arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1" || + Name == "arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1" || + Name == "arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1" || + Name == "arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1" || + Name == "arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1" || + Name == "arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1" || + Name == "arm.cde.vcx1q.predicated.v2i64.v4i1" || + Name == "arm.cde.vcx1qa.predicated.v2i64.v4i1" || + Name == "arm.cde.vcx2q.predicated.v2i64.v4i1" || + Name == "arm.cde.vcx2qa.predicated.v2i64.v4i1" || + Name == "arm.cde.vcx3q.predicated.v2i64.v4i1" || + Name == "arm.cde.vcx3qa.predicated.v2i64.v4i1") + return true; + break; } @@ -1826,6 +1851,9 @@ bool IsNVVM = Name.startswith("nvvm."); if (IsNVVM) Name = Name.substr(5); + bool IsARM = Name.startswith("arm."); + if (IsARM) + Name = Name.substr(4); if (IsX86 && Name.startswith("sse4a.movnt.")) { Module *M = F->getParent(); @@ -3649,6 +3677,84 @@ F->getParent(), Intrinsic::convert_from_fp16, {Builder.getFloatTy()}), CI->getArgOperand(0), "h2f"); + } else if (IsARM && Name == "mve.vctp64.old") { + // Replace the old v4i1 vctp64 with a v2i1 vctp and predicate-casts to the + // correct type.
+ Value *VCTP = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), Intrinsic::arm_mve_vctp64), + CI->getArgOperand(0), CI->getName()); + Value *C1 = Builder.CreateCall( + Intrinsic::getDeclaration( + F->getParent(), Intrinsic::arm_mve_pred_v2i, + {VectorType::get(Builder.getInt1Ty(), 2, false)}), + VCTP); + Rep = Builder.CreateCall( + Intrinsic::getDeclaration( + F->getParent(), Intrinsic::arm_mve_pred_i2v, + {VectorType::get(Builder.getInt1Ty(), 4, false)}), + C1); + } else if (IsARM && + (Name == "mve.mull.int.predicated.v2i64.v4i32.v4i1" || + Name == "mve.vqdmull.predicated.v2i64.v4i32.v4i1" || + Name == "mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1" || + Name == "mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1" || + Name == "mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1" || + Name == "mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1" || + Name == "mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1" || + Name == "mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1" || + Name == "cde.vcx1q.predicated.v2i64.v4i1" || + Name == "cde.vcx1qa.predicated.v2i64.v4i1" || + Name == "cde.vcx2q.predicated.v2i64.v4i1" || + Name == "cde.vcx2qa.predicated.v2i64.v4i1" || + Name == "cde.vcx3q.predicated.v2i64.v4i1" || + Name == "cde.vcx3qa.predicated.v2i64.v4i1")) { + std::vector<Type *> Tys; + unsigned ID = CI->getIntrinsicID(); + Type *V2I1Ty = FixedVectorType::get(Builder.getInt1Ty(), 2); + if (ID == Intrinsic::arm_mve_mull_int_predicated || + ID == Intrinsic::arm_mve_vqdmull_predicated || + ID == Intrinsic::arm_mve_vldr_gather_base_predicated) + Tys = {CI->getType(), CI->getOperand(0)->getType(), V2I1Ty}; + else if (ID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated || + ID == Intrinsic::arm_mve_vstr_scatter_base_predicated || + ID == Intrinsic::arm_mve_vstr_scatter_base_wb_predicated) + Tys = {CI->getOperand(0)->getType(), CI->getOperand(0)->getType(), + V2I1Ty}; + else if (ID == Intrinsic::arm_mve_vldr_gather_offset_predicated) + Tys =
{CI->getType(), CI->getOperand(0)->getType(), + CI->getOperand(1)->getType(), V2I1Ty}; + else if (ID == Intrinsic::arm_mve_vstr_scatter_offset_predicated) + Tys = {CI->getOperand(0)->getType(), CI->getOperand(1)->getType(), + CI->getOperand(2)->getType(), V2I1Ty}; + else if (ID == Intrinsic::arm_cde_vcx1q_predicated || + ID == Intrinsic::arm_cde_vcx1qa_predicated || + ID == Intrinsic::arm_cde_vcx2q_predicated || + ID == Intrinsic::arm_cde_vcx2qa_predicated || + ID == Intrinsic::arm_cde_vcx3q_predicated || + ID == Intrinsic::arm_cde_vcx3qa_predicated) + Tys = {CI->getOperand(1)->getType(), V2I1Ty}; + else + llvm_unreachable("Unhandled Intrinsic!"); + + std::vector<Value *> Ops; + for (Value *Op : CI->args()) { + Type *Ty = Op->getType(); + if (Ty->getScalarSizeInBits() == 1) { + Value *C1 = Builder.CreateCall( + Intrinsic::getDeclaration( + F->getParent(), Intrinsic::arm_mve_pred_v2i, + {VectorType::get(Builder.getInt1Ty(), 4, false)}), + Op); + Op = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), + Intrinsic::arm_mve_pred_i2v, {V2I1Ty}), + C1); + } + Ops.push_back(Op); + } + + Function *Fn = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + Rep = Builder.CreateCall(Fn, Ops, CI->getName()); } else { llvm_unreachable("Unknown function for CallInst upgrade."); } diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -254,13 +254,6 @@ // An LLVM ValueType representing a corresponding vector of // predicate bits, for use in ISel patterns that handle an IR // intrinsic describing the predicated form of the instruction. - // - // Usually, for a vector of N things, this will be vNi1. But for - // vectors of 2 values, we make an exception, and use v4i1 instead - // of v2i1.
Rationale: MVE codegen doesn't support doing all the - // auxiliary operations on v2i1 (vector shuffles etc), and also, - // there's no MVE compare instruction that will _generate_ v2i1 - // directly. ValueType Pred = pred; // Same as Pred but for DblVec rather than Vec. @@ -294,25 +287,25 @@ // Integer vector types that don't treat signed and unsigned differently. def MVE_v16i8 : MVEVectorVTInfo; def MVE_v8i16 : MVEVectorVTInfo; -def MVE_v4i32 : MVEVectorVTInfo; -def MVE_v2i64 : MVEVectorVTInfo; +def MVE_v4i32 : MVEVectorVTInfo; +def MVE_v2i64 : MVEVectorVTInfo; // Explicitly signed and unsigned integer vectors. They map to the // same set of LLVM ValueTypes as above, but are represented // differently in assembly and instruction encodings. def MVE_v16s8 : MVEVectorVTInfo; def MVE_v8s16 : MVEVectorVTInfo; -def MVE_v4s32 : MVEVectorVTInfo; -def MVE_v2s64 : MVEVectorVTInfo; +def MVE_v4s32 : MVEVectorVTInfo; +def MVE_v2s64 : MVEVectorVTInfo; def MVE_v16u8 : MVEVectorVTInfo; def MVE_v8u16 : MVEVectorVTInfo; -def MVE_v4u32 : MVEVectorVTInfo; -def MVE_v2u64 : MVEVectorVTInfo; +def MVE_v4u32 : MVEVectorVTInfo; +def MVE_v2u64 : MVEVectorVTInfo; // FP vector types. def MVE_v8f16 : MVEVectorVTInfo; -def MVE_v4f32 : MVEVectorVTInfo; -def MVE_v2f64 : MVEVectorVTInfo; +def MVE_v4f32 : MVEVectorVTInfo; +def MVE_v2f64 : MVEVectorVTInfo; // Polynomial vector types. 
def MVE_v16p8 : MVEVectorVTInfo; diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -213,7 +213,8 @@ auto *TC = SE->getSCEV(TripCount); int VectorWidth = cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements(); - if (VectorWidth != 4 && VectorWidth != 8 && VectorWidth != 16) + if (VectorWidth != 2 && VectorWidth != 4 && VectorWidth != 8 && + VectorWidth != 16) return false; ConstantInt *ConstElemCount = nullptr; @@ -371,15 +372,10 @@ switch (VectorWidth) { default: llvm_unreachable("unexpected number of lanes"); + case 2: VCTPID = Intrinsic::arm_mve_vctp64; break; case 4: VCTPID = Intrinsic::arm_mve_vctp32; break; case 8: VCTPID = Intrinsic::arm_mve_vctp16; break; case 16: VCTPID = Intrinsic::arm_mve_vctp8; break; - - // FIXME: vctp64 currently not supported because the predicate - // vector wants to be <2 x i1>, but v2i1 is not a legal MVE - // type, so problems happen at isel time. - // Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics - // purposes, but takes a v4i1 instead of a v2i1.
} Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); Value *VCTPCall = Builder.CreateCall(VCTP, Processed); diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -442,7 +442,7 @@ define void @test_width2(i32* nocapture readnone %x, i32* nocapture %y, i8 zeroext %m) { ; CHECK-LABEL: test_width2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: beq .LBB5_3 @@ -450,86 +450,52 @@ ; CHECK-NEXT: adds r0, r2, #1 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: bic r0, r0, #1 -; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: subs r0, #2 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r2 -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: vand q1, q1, q0 -; CHECK-NEXT: add.w lr, r3, r0, lsr #1 +; CHECK-NEXT: add.w r0, r3, r0, lsr #1 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q2[2], q2[0], r12, r12 -; CHECK-NEXT: vmov r6, r7, d3 -; CHECK-NEXT: vand q2, q2, q0 -; CHECK-NEXT: add.w r12, r12, #2 -; CHECK-NEXT: vmov r0, r2, d5 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: vmov q2[2], q2[0], r3, r0 -; CHECK-NEXT: adc r2, r2, #0 -; CHECK-NEXT: vand q2, q2, q0 -; CHECK-NEXT: vmov r4, r5, d5 -; CHECK-NEXT: subs r6, r4, r6 -; CHECK-NEXT: eor.w r0, r0, r4 -; CHECK-NEXT: sbcs r5, r7 -; CHECK-NEXT: cset r5, lo -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: cset r5, ne -; CHECK-NEXT: orrs r0, r2 -; CHECK-NEXT: cset r0, eq -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: ands r0, r5 -; CHECK-NEXT: vmov r5, r6, d2 -; CHECK-NEXT: rsbs r2, r0, #0 -; CHECK-NEXT: vmov r0, r4, d4 -; CHECK-NEXT: @ implicit-def: $q2 -; CHECK-NEXT: subs r5, r0, r5 -; CHECK-NEXT: sbcs r4, r6 -; CHECK-NEXT: cset r4, lo -; CHECK-NEXT: cmp r4, #0 -; 
CHECK-NEXT: cset r4, ne -; CHECK-NEXT: eors r0, r3 -; CHECK-NEXT: cset r0, eq -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: ands r0, r4 -; CHECK-NEXT: sub.w r4, r1, #8 -; CHECK-NEXT: rsbs r5, r0, #0 +; CHECK-NEXT: vctp.64 r2 +; CHECK-NEXT: @ implicit-def: $q0 +; CHECK-NEXT: subs r2, #2 +; CHECK-NEXT: vmrs r3, p0 +; CHECK-NEXT: and r0, r3, #1 +; CHECK-NEXT: ubfx r3, r3, #8, #1 +; CHECK-NEXT: rsb.w r12, r0, #0 ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: bfi r0, r5, #0, #1 -; CHECK-NEXT: bfi r0, r2, #1, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r0, r12, #0, #1 +; CHECK-NEXT: sub.w r12, r1, #8 +; CHECK-NEXT: bfi r0, r3, #1, #1 ; CHECK-NEXT: lsls r3, r0, #31 ; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrne r3, [r4] -; CHECK-NEXT: vmovne.32 q2[0], r3 -; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: ldrne.w r3, [r12] +; CHECK-NEXT: vmovne.32 q0[0], r3 ; CHECK-NEXT: lsls r0, r0, #30 -; CHECK-NEXT: bfi r3, r5, #0, #8 ; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrmi r0, [r4, #4] -; CHECK-NEXT: vmovmi.32 q2[2], r0 -; CHECK-NEXT: bfi r3, r2, #8, #8 +; CHECK-NEXT: ldrmi.w r0, [r12, #4] +; CHECK-NEXT: vmovmi.32 q0[2], r0 +; CHECK-NEXT: vmrs r3, p0 ; CHECK-NEXT: and r0, r3, #1 -; CHECK-NEXT: rsbs r2, r0, #0 +; CHECK-NEXT: ubfx r3, r3, #8, #1 +; CHECK-NEXT: rsb.w r12, r0, #0 ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: bfi r0, r2, #0, #1 -; CHECK-NEXT: ubfx r2, r3, #8, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r0, r2, #1, #1 -; CHECK-NEXT: lsls r2, r0, #31 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r0, r12, #0, #1 +; CHECK-NEXT: bfi r0, r3, #1, #1 +; CHECK-NEXT: lsls r3, r0, #31 ; CHECK-NEXT: itt ne -; CHECK-NEXT: vmovne r2, s8 -; CHECK-NEXT: strne r2, [r1] +; CHECK-NEXT: vmovne r3, s0 +; CHECK-NEXT: strne r3, [r1] ; CHECK-NEXT: lsls r0, r0, #30 ; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r0, s10 +; CHECK-NEXT: vmovmi r0, s2 ; CHECK-NEXT: strmi r0, [r1, #4] ; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: le lr, .LBB5_2 ; CHECK-NEXT: .LBB5_3: @ %for.cond.cleanup ; 
CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop {r7, pc} entry: %cmp9.not = icmp eq i8 %m, 0 br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.preheader diff --git a/llvm/test/CodeGen/Thumb2/cde-vec.ll b/llvm/test/CodeGen/Thumb2/cde-vec.ll --- a/llvm/test/CodeGen/Thumb2/cde-vec.ll +++ b/llvm/test/CodeGen/Thumb2/cde-vec.ll @@ -116,11 +116,12 @@ declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) +declare <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32) declare <8 x i16> @llvm.arm.cde.vcx1q.predicated.v8i16.v8i1(i32 immarg, <8 x i16>, i32 immarg, <8 x i1>) declare <16 x i8> @llvm.arm.cde.vcx1qa.predicated.v16i8.v16i1(i32 immarg, <16 x i8>, i32 immarg, <16 x i1>) declare <4 x i32> @llvm.arm.cde.vcx2q.predicated.v4i32.v4i1(i32 immarg, <4 x i32>, <16 x i8>, i32 immarg, <4 x i1>) declare <4 x float> @llvm.arm.cde.vcx2qa.predicated.v4f32.v4i1(i32 immarg, <4 x float>, <16 x i8>, i32 immarg, <4 x i1>) -declare <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v4i1(i32 immarg, <2 x i64>, <16 x i8>, <16 x i8>, i32 immarg, <4 x i1>) +declare <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v2i1(i32 immarg, <2 x i64>, <16 x i8>, <16 x i8>, i32 immarg, <2 x i1>) declare <4 x float> @llvm.arm.cde.vcx3qa.predicated.v4f32.v4i1(i32 immarg, <4 x float>, <16 x i8>, <16 x i8>, i32 immarg, <4 x i1>) define arm_aapcs_vfpcc <8 x i16> @test_vcx1q_m(<8 x i16> %inactive, i16 zeroext %p) { @@ -191,8 +192,8 @@ entry: %0 = bitcast <4 x float> %n to <16 x i8> %1 = zext i16 %p to i32 - %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) - %3 = call <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v4i1(i32 0, <2 x i64> %inactive, <16 x i8> %0, <16 x i8> %m, i32 11, <4 x i1> %2) + %2 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %1) + %3 = call <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v2i1(i32 0, <2 x i64> %inactive, <16 x i8> %0, <16 x i8> %m, i32 11, <2 x i1> 
%2) ret <2 x i64> %3 } diff --git a/llvm/test/CodeGen/Thumb2/mve-gatherscatter-mmo.ll b/llvm/test/CodeGen/Thumb2/mve-gatherscatter-mmo.ll --- a/llvm/test/CodeGen/Thumb2/mve-gatherscatter-mmo.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gatherscatter-mmo.ll @@ -94,8 +94,8 @@ ; CHECK: MVE_VSTRD64_qi %1, %0, 888, 1, killed %3, $noreg :: (store (s128), align 1) entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 888, <2 x i64> %value, <4 x i1> %1) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v2i1(<2 x i64> %addr, i32 888, <2 x i64> %value, <2 x i1> %1) ret void } @@ -115,14 +115,15 @@ entry: %0 = load <2 x i64>, <2 x i64>* %addr, align 8 %1 = zext i16 %p to i32 - %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) - %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 248, <2 x i64> %value, <4 x i1> %2) + %2 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %1) + %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> %0, i32 248, <2 x i64> %value, <2 x i1> %2) store <2 x i64> %3, <2 x i64>* %addr, align 8 ret void } declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) +declare <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32) declare <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(i8*, <8 x i16>, i32, i32, i32) declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i8.v4i32.v4i1(i8*, <4 x i32>, i32, i32, i32, <4 x i1>) declare <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64>, i32) @@ -132,6 +133,6 @@ declare void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v16i8.v16i8.v16i1(i8*, <16 x i8>, <16 x i8>, 
i32, i32, <16 x i1>) declare void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64>, i32, <2 x i64>) -declare void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <2 x i64>, <4 x i1>) +declare void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v2i1(<2 x i64>, i32, <2 x i64>, <2 x i1>) declare <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64>, i32, <2 x i64>) -declare <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <2 x i64>, <4 x i1>) +declare <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64>, i32, <2 x i64>, <2 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll @@ -4,7 +4,7 @@ declare <16 x i1> @llvm.arm.mve.vctp8(i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) -declare <4 x i1> @llvm.arm.mve.vctp64(i32) +declare <2 x i1> @llvm.arm.mve.vctp64(i32) declare i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1>) declare i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1>) @@ -116,8 +116,8 @@ ; CHECK-NEXT: vmrs r0, p0 ; CHECK-NEXT: bx lr entry: - %0 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a) - %1 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %0) + %0 = call <2 x i1> @llvm.arm.mve.vctp64(i32 %a) + %1 = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> %0) %2 = trunc i32 %1 to i16 ret i16 %2 } @@ -125,17 +125,23 @@ define arm_aapcs_vfpcc zeroext i16 @test_vctp64q_m(i32 %a, i16 zeroext %p) { ; CHECK-LABEL: test_vctp64q_m: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmsr p0, r1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vctpt.64 r0 +; CHECK-NEXT: vctp.64 r0 ; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: ands r1, r0 +; CHECK-NEXT: and r0, r1, #1 +; CHECK-NEXT: ubfx r1, r1, #8, #1 +; CHECK-NEXT: rsbs r2, r0, #0 +; CHECK-NEXT: movs r0, #0 +; 
CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r0, r2, #0, #8 +; CHECK-NEXT: bfi r0, r1, #8, #8 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a) - %3 = and <4 x i1> %1, %2 - %4 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %3) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = call <2 x i1> @llvm.arm.mve.vctp64(i32 %a) + %3 = and <2 x i1> %1, %2 + %4 = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> %3) %5 = trunc i32 %4 to i16 ret i16 %5 } diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll @@ -245,15 +245,16 @@ entry: %0 = load <2 x i64>, <2 x i64>* %addr, align 8 %1 = zext i16 %p to i32 - %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) - %3 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 664, <4 x i1> %2) + %2 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %1) + %3 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> %0, i32 664, <2 x i1> %2) %4 = extractvalue { <2 x i64>, <2 x i64> } %3, 1 store <2 x i64> %4, <2 x i64>* %addr, align 8 %5 = extractvalue { <2 x i64>, <2 x i64> } %3, 0 ret <2 x i64> %5 } -declare { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <4 x i1>) +declare <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32) +declare { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64>, i32, <2 x i1>) define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_z_u64(<2 x i64>* %addr, i16 zeroext %p) { ; CHECK-LABEL: test_vldrdq_gather_base_wb_z_u64: @@ -267,8 +268,8 @@ entry: %0 = load <2 x i64>, <2 x i64>* %addr, align 8 %1 = 
zext i16 %p to i32 - %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) - %3 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 656, <4 x i1> %2) + %2 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %1) + %3 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> %0, i32 656, <2 x i1> %2) %4 = extractvalue { <2 x i64>, <2 x i64> } %3, 1 store <2 x i64> %4, <2 x i64>* %addr, align 8 %5 = extractvalue { <2 x i64>, <2 x i64> } %3, 0 @@ -285,12 +286,12 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 888, <4 x i1> %1) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v2i1(<2 x i64> %addr, i32 888, <2 x i1> %1) ret <2 x i64> %2 } -declare <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <4 x i1>) +declare <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v2i1(<2 x i64>, i32, <2 x i1>) define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_z_u64(<2 x i64> %addr, i16 zeroext %p) { ; CHECK-LABEL: test_vldrdq_gather_base_z_u64: @@ -302,8 +303,8 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 -1000, <4 x i1> %1) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v2i1(<2 x i64> %addr, i32 -1000, <2 x i1> %1) ret <2 x i64> %2 } @@ -341,12 +342,12 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = call <2 x i64> 
@llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 0, <4 x i1> %1) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v2i1(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 0, <2 x i1> %1) ret <2 x i64> %2 } -declare <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64*, <2 x i64>, i32, i32, i32, <4 x i1>) +declare <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v2i1(i64*, <2 x i64>, i32, i32, i32, <2 x i1>) define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_z_u64(i64* %base, <2 x i64> %offset, i16 zeroext %p) { ; CHECK-LABEL: test_vldrdq_gather_offset_z_u64: @@ -358,8 +359,8 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 1, <4 x i1> %1) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v2i1(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 1, <2 x i1> %1) ret <2 x i64> %2 } @@ -395,8 +396,8 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 0, <4 x i1> %1) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v2i1(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 0, <2 x i1> %1) ret <2 x i64> %2 } @@ -410,8 +411,8 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = call <2 x i64> 
@llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 1, <4 x i1> %1) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v2i1(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 1, <2 x i1> %1) ret <2 x i64> %2 } @@ -1218,12 +1219,12 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 888, <2 x i64> %value, <4 x i1> %1) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v2i1(<2 x i64> %addr, i32 888, <2 x i64> %value, <2 x i1> %1) ret void } -declare void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <2 x i64>, <4 x i1>) +declare void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v2i1(<2 x i64>, i32, <2 x i64>, <2 x i1>) define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_p_u64(<2 x i64> %addr, <2 x i64> %value, i16 zeroext %p) { ; CHECK-LABEL: test_vstrdq_scatter_base_p_u64: @@ -1234,8 +1235,8 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 264, <2 x i64> %value, <4 x i1> %1) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v2i1(<2 x i64> %addr, i32 264, <2 x i64> %value, <2 x i1> %1) ret void } @@ -1273,13 +1274,13 @@ entry: %0 = load <2 x i64>, <2 x i64>* %addr, align 8 %1 = zext i16 %p to i32 - %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) - %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 248, <2 x i64> %value, <4 x i1> %2) + %2 = call <2 x i1> 
@llvm.arm.mve.pred.i2v.v2i1(i32 %1) + %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> %0, i32 248, <2 x i64> %value, <2 x i1> %2) store <2 x i64> %3, <2 x i64>* %addr, align 8 ret void } -declare <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <2 x i64>, <4 x i1>) +declare <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64>, i32, <2 x i64>, <2 x i1>) define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_p_u64(<2 x i64>* %addr, <2 x i64> %value, i16 zeroext %p) { ; CHECK-LABEL: test_vstrdq_scatter_base_wb_p_u64: @@ -1293,8 +1294,8 @@ entry: %0 = load <2 x i64>, <2 x i64>* %addr, align 8 %1 = zext i16 %p to i32 - %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) - %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 136, <2 x i64> %value, <4 x i1> %2) + %2 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %1) + %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> %0, i32 136, <2 x i64> %value, <2 x i1> %2) store <2 x i64> %3, <2 x i64>* %addr, align 8 ret void } @@ -1338,12 +1339,12 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0, <4 x i1> %1) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v2i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0, <2 x i1> %1) ret void } -declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64*, <2 x i64>, <2 x i64>, i32, i32, <4 x i1>) +declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v2i1(i64*, <2 x i64>, <2 x i64>, i32, i32, <2 x i1>) define arm_aapcs_vfpcc void 
@test_vstrdq_scatter_offset_p_u64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) { ; CHECK-LABEL: test_vstrdq_scatter_offset_p_u64: @@ -1354,8 +1355,8 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0, <4 x i1> %1) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v2i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0, <2 x i1> %1) ret void } @@ -1390,8 +1391,8 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3, <4 x i1> %1) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v2i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3, <2 x i1> %1) ret void } @@ -1404,8 +1405,8 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3, <4 x i1> %1) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v2i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3, <2 x i1> %1) ret void } diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/v2i1-upgrade.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/v2i1-upgrade.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/v2i1-upgrade.ll @@ -0,0 +1,332 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py +; RUN: opt -S -o - %s | FileCheck %s + +declare <4 x i1> @llvm.arm.mve.vctp64(i32) +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) +declare i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1>) +declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, i32, <4 x i1>, <2 x i64>) +declare <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <2 x i64>) + +declare <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <4 x i1>) +declare { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <4 x i1>) +declare <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64*, <2 x i64>, i32, i32, i32, <4 x i1>) +declare void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <2 x i64>, <4 x i1>) +declare <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <2 x i64>, <4 x i1>) +declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64*, <2 x i64>, <2 x i64>, i32, i32, <4 x i1>) + +declare <2 x i64> @llvm.arm.cde.vcx1q.predicated.v2i64.v4i1(i32 immarg, <2 x i64>, i32 immarg, <4 x i1>) +declare <2 x i64> @llvm.arm.cde.vcx1qa.predicated.v2i64.v4i1(i32 immarg, <2 x i64>, i32 immarg, <4 x i1>) +declare <2 x i64> @llvm.arm.cde.vcx2q.predicated.v2i64.v4i1(i32 immarg, <2 x i64>, <16 x i8>, i32 immarg, <4 x i1>) +declare <2 x i64> @llvm.arm.cde.vcx2qa.predicated.v2i64.v4i1(i32 immarg, <2 x i64>, <16 x i8>, i32 immarg, <4 x i1>) +declare <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v4i1(i32 immarg, <2 x i64>, <16 x i8>, <16 x i8>, i32 immarg, <4 x i1>) +declare <2 x i64> @llvm.arm.cde.vcx3qa.predicated.v2i64.v4i1(i32 immarg, <2 x i64>, <16 x i8>, <16 x i8>, i32 immarg, <4 x i1>) + +define arm_aapcs_vfpcc zeroext i16 @test_vctp64q(i32 %a) { +; CHECK-LABEL: @test_vctp64q( +; CHECK-NEXT: entry: +; 
CHECK-NEXT: [[TMP0:%.*]] = call <2 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +; CHECK-NEXT: ret i16 [[TMP4]] +; +entry: + %0 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a) + %1 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %0) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp64q_m(i32 %a, i16 zeroext %p) { +; CHECK-LABEL: @test_vctp64q_m( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i1> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16 +; CHECK-NEXT: ret i16 [[TMP7]] +; +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a) + %3 = and <4 x i1> %1, %2 + %4 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %3) + %5 = trunc i32 %4 to i16 + ret i16 %5 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vmullbq_int_m_s32(<2 x i64> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: @test_vmullbq_int_m_s32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 
@llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v2i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 0, <2 x i1> [[TMP3]], <2 x i64> [[INACTIVE:%.*]]) +; CHECK-NEXT: ret <2 x i64> [[TMP4]] +; +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, i32 0, <4 x i1> %1, <2 x i64> %inactive) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vqdmullbq_m_s32(<2 x i64> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: @test_vqdmullbq_m_s32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v2i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <2 x i1> [[TMP3]], <2 x i64> [[INACTIVE:%.*]]) +; CHECK-NEXT: ret <2 x i64> [[TMP4]] +; +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <2 x i64> %inactive) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_z_s64(<2 x i64> %addr, i16 zeroext %p) { +; CHECK-LABEL: @test_vldrdq_gather_base_z_s64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 
@llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v2i1(<2 x i64> [[ADDR:%.*]], i32 888, <2 x i1> [[TMP3]]) +; CHECK-NEXT: ret <2 x i64> [[TMP4]] +; +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 888, <4 x i1> %1) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_z_s64(<2 x i64>* %addr, i16 zeroext %p) { +; CHECK-LABEL: @test_vldrdq_gather_base_wb_z_s64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> [[TMP0]], i32 664, <2 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP5]], 1 +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[ADDR]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP5]], 0 +; CHECK-NEXT: ret <2 x i64> [[TMP7]] +; +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 664, <4 x i1> %2) + %4 = extractvalue { <2 x i64>, <2 x i64> } %3, 1 + store <2 x i64> %4, <2 x i64>* %addr, align 8 + %5 = extractvalue { <2 x i64>, <2 x i64> } %3, 
0 + ret <2 x i64> %5 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_z_s64(i64* %base, <2 x i64> %offset, i16 zeroext %p) { +; CHECK-LABEL: @test_vldrdq_gather_offset_z_s64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v2i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 0, i32 0, <2 x i1> [[TMP3]]) +; CHECK-NEXT: ret <2 x i64> [[TMP4]] +; +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 0, <4 x i1> %1) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_p_s64(<2 x i64> %addr, <2 x i64> %value, i16 zeroext %p) { +; CHECK-LABEL: @test_vstrdq_scatter_base_p_s64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP2]]) +; CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v2i1(<2 x i64> [[ADDR:%.*]], i32 888, <2 x i64> [[VALUE:%.*]], <2 x i1> [[TMP3]]) +; CHECK-NEXT: ret void +; +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 888, <2 x i64> %value, <4 x i1> %1) + ret void +} + +define arm_aapcs_vfpcc void 
@test_vstrdq_scatter_base_wb_p_s64(<2 x i64>* %addr, <2 x i64> %value, i16 zeroext %p) { +; CHECK-LABEL: @test_vstrdq_scatter_base_wb_p_s64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> [[TMP0]], i32 248, <2 x i64> [[VALUE:%.*]], <2 x i1> [[TMP4]]) +; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[ADDR]], align 8 +; CHECK-NEXT: ret void +; +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 248, <2 x i64> %value, <4 x i1> %2) + store <2 x i64> %3, <2 x i64>* %addr, align 8 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_p_s64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) { +; CHECK-LABEL: @test_vstrdq_scatter_offset_p_s64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP2]]) +; CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v2i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 0, <2 x i1> [[TMP3]]) +; CHECK-NEXT: ret void +; +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> 
@llvm.arm.mve.pred.i2v.v4i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0, <4 x i1> %1) + ret void +} + +define <8 x i16> @test_vcx1q_m(<2 x i64> %inactive, i16 zeroext %p) { +; CHECK-LABEL: @test_vcx1q_m( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.cde.vcx1q.predicated.v2i64.v2i1(i32 0, <2 x i64> [[INACTIVE:%.*]], i32 1111, <2 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16> +; CHECK-NEXT: ret <8 x i16> [[TMP5]] +; +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <2 x i64> @llvm.arm.cde.vcx1q.predicated.v2i64.v4i1(i32 0, <2 x i64> %inactive, i32 1111, <4 x i1> %1) + %3 = bitcast <2 x i64> %2 to <8 x i16> + ret <8 x i16> %3 +} + +define <16 x i8> @test_vcx1qa_m(<2 x i64> %acc, i16 zeroext %p) { +; CHECK-LABEL: @test_vcx1qa_m( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.cde.vcx1qa.predicated.v2i64.v2i1(i32 0, <2 x i64> [[ACC:%.*]], i32 1112, <2 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +; CHECK-NEXT: ret <16 x i8> [[TMP5]] +; +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = 
tail call <2 x i64> @llvm.arm.cde.vcx1qa.predicated.v2i64.v4i1(i32 0, <2 x i64> %acc, i32 1112, <4 x i1> %1) + %3 = bitcast <2 x i64> %2 to <16 x i8> + ret <16 x i8> %3 +} + +define <4 x i32> @test_vcx2q_m(<2 x i64> %inactive, <4 x float> %n, i16 zeroext %p) { +; CHECK-LABEL: @test_vcx2q_m( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[N:%.*]] to <16 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.arm.cde.vcx2q.predicated.v2i64.v2i1(i32 0, <2 x i64> [[INACTIVE:%.*]], <16 x i8> [[TMP0]], i32 111, <2 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP6]] +; +entry: + %0 = bitcast <4 x float> %n to <16 x i8> + %1 = zext i16 %p to i32 + %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = tail call <2 x i64> @llvm.arm.cde.vcx2q.predicated.v2i64.v4i1(i32 0, <2 x i64> %inactive, <16 x i8> %0, i32 111, <4 x i1> %2) + %4 = bitcast <2 x i64> %3 to <4 x i32> + ret <4 x i32> %4 +} + +define <4 x float> @test_vcx2qa_m(<2 x i64> %acc, <8 x half> %n, i16 zeroext %p) { +; CHECK-LABEL: @test_vcx2qa_m( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[N:%.*]] to <16 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.arm.cde.vcx2qa.predicated.v2i64.v2i1(i32 0, <2 x i64> [[ACC:%.*]], <16 x i8> [[TMP0]], i32 112, 
<2 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <4 x float> +; CHECK-NEXT: ret <4 x float> [[TMP6]] +; +entry: + %0 = bitcast <8 x half> %n to <16 x i8> + %1 = zext i16 %p to i32 + %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = tail call <2 x i64> @llvm.arm.cde.vcx2qa.predicated.v2i64.v4i1(i32 0, <2 x i64> %acc, <16 x i8> %0, i32 112, <4 x i1> %2) + %4 = bitcast <2 x i64> %3 to <4 x float> + ret <4 x float> %4 +} + +define <2 x i64> @test_vcx3q_m(<2 x i64> %inactive, <4 x float> %n, <16 x i8> %m, i16 zeroext %p) { +; CHECK-LABEL: @test_vcx3q_m( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[N:%.*]] to <16 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v2i1(i32 0, <2 x i64> [[INACTIVE:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[M:%.*]], i32 11, <2 x i1> [[TMP4]]) +; CHECK-NEXT: ret <2 x i64> [[TMP5]] +; +entry: + %0 = bitcast <4 x float> %n to <16 x i8> + %1 = zext i16 %p to i32 + %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = tail call <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v4i1(i32 0, <2 x i64> %inactive, <16 x i8> %0, <16 x i8> %m, i32 11, <4 x i1> %2) + ret <2 x i64> %3 +} + +define <8 x half> @test_vcx3qa_m(<2 x i64> %inactive, <8 x half> %n, <4 x i32> %m, i16 zeroext %p) { +; CHECK-LABEL: @test_vcx3qa_m( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[N:%.*]] to <16 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[M:%.*]] to <16 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 
[[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.arm.cde.vcx3qa.predicated.v2i64.v2i1(i32 0, <2 x i64> [[INACTIVE:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 12, <2 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <8 x half> +; CHECK-NEXT: ret <8 x half> [[TMP7]] +; +entry: + %0 = bitcast <8 x half> %n to <16 x i8> + %1 = bitcast <4 x i32> %m to <16 x i8> + %2 = zext i16 %p to i32 + %3 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2) + %4 = tail call <2 x i64> @llvm.arm.cde.vcx3qa.predicated.v2i64.v4i1(i32 0, <2 x i64> %inactive, <16 x i8> %0, <16 x i8> %1, i32 12, <4 x i1> %3) + %5 = bitcast <2 x i64> %4 to <8 x half> + ret <8 x half> %5 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vldr.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vldr.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vldr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vldr.ll @@ -49,8 +49,8 @@ entry: %0 = load <2 x i64>, <2 x i64>* %addr, align 8 %1 = zext i16 %p to i32 - %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) - %3 = tail call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 656, <4 x i1> %2) + %2 = tail call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %1) + %3 = tail call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> %0, i32 656, <2 x i1> %2) %4 = extractvalue { <2 x i64>, <2 x i64> } %3, 1 store <2 x i64> %4, <2 x i64>* %addr, align 8 %5 = extractvalue { <2 x i64>, <2 x i64> } %3, 0 @@ -58,5 +58,6 @@ } declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) +declare <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32) -declare { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <4 x i1>) +declare { <2 x 
i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64>, i32, <2 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll @@ -95,12 +95,13 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, i32 0, <4 x i1> %1, <2 x i64> %inactive) + %1 = tail call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v2i1(<4 x i32> %a, <4 x i32> %b, i32 0, i32 0, <2 x i1> %1, <2 x i64> %inactive) ret <2 x i64> %2 } -declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, i32, <4 x i1>, <2 x i64>) #1 +declare <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32) +declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v2i1(<4 x i32>, <4 x i32>, i32, i32, <2 x i1>, <2 x i64>) #1 define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_poly_m_p8(<8 x i16> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmullbq_poly_m_p8: @@ -156,8 +157,8 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, i32 0, <4 x i1> %1, <2 x i64> undef) + %1 = tail call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v2i1(<4 x i32> %a, <4 x i32> %b, i32 1, i32 0, <2 x i1> %1, <2 x i64> undef) ret <2 x i64> %2 } diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll --- 
a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll @@ -95,12 +95,13 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, i32 1, <4 x i1> %1, <2 x i64> %inactive) + %1 = tail call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v2i1(<4 x i32> %a, <4 x i32> %b, i32 0, i32 1, <2 x i1> %1, <2 x i64> %inactive) ret <2 x i64> %2 } -declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, i32, <4 x i1>, <2 x i64>) #1 +declare <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32) +declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v2i1(<4 x i32>, <4 x i32>, i32, i32, <2 x i1>, <2 x i64>) #1 define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_poly_m_p8(<8 x i16> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { ; CHECK-LABEL: test_vmulltq_poly_m_p8: @@ -156,8 +157,8 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, i32 1, <4 x i1> %1, <2 x i64> undef) + %1 = tail call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v2i1(<4 x i32> %a, <4 x i32> %b, i32 1, i32 1, <2 x i1> %1, <2 x i64> undef) ret <2 x i64> %2 } diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmull.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmull.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmull.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmull.ll @@ -2,11 +2,12 @@ ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s declare <4 x i1> 
@llvm.arm.mve.pred.i2v.v4i1(i32) +declare <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32) declare <4 x i32> @llvm.arm.mve.vqdmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32) declare <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32) declare <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16>, <8 x i16>, i32, <4 x i1>, <4 x i32>) -declare <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <2 x i64>) +declare <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v2i1(<4 x i32>, <4 x i32>, i32, <2 x i1>, <2 x i64>) define arm_aapcs_vfpcc <4 x i32> @test_vqdmullbq_s16(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_vqdmullbq_s16: @@ -52,8 +53,8 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <2 x i64> %inactive) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v2i1(<4 x i32> %a, <4 x i32> %b, i32 0, <2 x i1> %1, <2 x i64> %inactive) ret <2 x i64> %2 } @@ -109,8 +110,8 @@ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 0, <4 x i1> %1, <2 x i64> %inactive) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v2i1(<4 x i32> %a, <4 x i32> %.splat, i32 0, <2 x i1> %1, <2 x i64> %inactive) ret <2 x i64> %2 } @@ -158,8 +159,8 @@ ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = call <2 x i64> 
@llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <2 x i64> %inactive) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v2i1(<4 x i32> %a, <4 x i32> %b, i32 1, <2 x i1> %1, <2 x i64> %inactive) ret <2 x i64> %2 } @@ -215,7 +216,7 @@ %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %0 = zext i16 %p to i32 - %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, i32 1, <4 x i1> %1, <2 x i64> %inactive) + %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vqdmull.predicated.v2i64.v4i32.v2i1(<4 x i32> %a, <4 x i32> %.splat, i32 1, <2 x i1> %1, <2 x i64> %inactive) ret <2 x i64> %2 } diff --git a/llvm/test/CodeGen/Thumb2/mve-vctp.ll b/llvm/test/CodeGen/Thumb2/mve-vctp.ll --- a/llvm/test/CodeGen/Thumb2/mve-vctp.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vctp.ll @@ -52,6 +52,22 @@ ret void } +define void @vctp64(i32 %arg, <2 x i64> *%in, <2 x i64>* %out) { +; CHECK-LABEL: vctp64: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vctp.64 r0 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmovt q0, q1 +; CHECK-NEXT: vstrw.32 q0, [r2] +; CHECK-NEXT: bx lr + %pred = call <2 x i1> @llvm.arm.mve.vctp64(i32 %arg) + %ld = load <2 x i64>, <2 x i64>* %in + %res = select <2 x i1> %pred, <2 x i64> %ld, <2 x i64> zeroinitializer + store <2 x i64> %res, <2 x i64>* %out + ret void +} define arm_aapcs_vfpcc <4 x i32> @vcmp_ult_v4i32(i32 %n, <4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vcmp_ult_v4i32: @@ -208,3 +224,4 @@ declare <16 x i1> @llvm.arm.mve.vctp8(i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) +declare <2 x i1> @llvm.arm.mve.vctp64(i32) 
diff --git a/llvm/test/Transforms/InstCombine/ARM/mve-v2i2v.ll b/llvm/test/Transforms/InstCombine/ARM/mve-v2i2v.ll --- a/llvm/test/Transforms/InstCombine/ARM/mve-v2i2v.ll +++ b/llvm/test/Transforms/InstCombine/ARM/mve-v2i2v.ll @@ -3,10 +3,12 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +declare i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1>) declare i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1>) declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>) declare i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1>) +declare <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32) declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) @@ -14,6 +16,17 @@ ; Round-trip conversions from predicate vector to i32 back to the same ; size of vector should be eliminated. +define <2 x i1> @v2i2v_2(<2 x i1> %vin) { +; CHECK-LABEL: @v2i2v_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x i1> [[VIN:%.*]] +; +entry: + %int = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> %vin) + %vout = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %int) + ret <2 x i1> %vout +} + define <4 x i1> @v2i2v_4(<4 x i1> %vin) { ; CHECK-LABEL: @v2i2v_4( ; CHECK-NEXT: entry: @@ -50,10 +63,23 @@ ; Conversions from a predicate vector to i32 and then to a _different_ ; size of predicate vector should be left alone. 
+define <16 x i1> @v2i2v_2_16(<2 x i1> %vin) { +; CHECK-LABEL: @v2i2v_2_16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INT:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> [[VIN:%.*]]), !range [[RNG0:![0-9]+]] +; CHECK-NEXT: [[VOUT:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[INT]]) +; CHECK-NEXT: ret <16 x i1> [[VOUT]] +; +entry: + %int = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> %vin) + %vout = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %int) + ret <16 x i1> %vout +} + define <16 x i1> @v2i2v_4_16(<4 x i1> %vin) { ; CHECK-LABEL: @v2i2v_4_16( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[INT:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[VIN:%.*]]), !range [[RNG0:![0-9]+]] +; CHECK-NEXT: [[INT:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[VIN:%.*]]), !range [[RNG0]] ; CHECK-NEXT: [[VOUT:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[INT]]) ; CHECK-NEXT: ret <16 x i1> [[VOUT]] ; @@ -92,6 +118,17 @@ ; Round-trip conversions from i32 to predicate vector back to i32 ; should be eliminated. +define i32 @i2v2i_2(i32 %iin) { +; CHECK-LABEL: @i2v2i_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i32 [[IIN:%.*]] +; +entry: + %vec = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %iin) + %iout = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> %vec) + ret i32 %iout +} + define i32 @i2v2i_4(i32 %iin) { ; CHECK-LABEL: @i2v2i_4( ; CHECK-NEXT: entry: @@ -242,6 +279,19 @@ ; a complement of the vector itself. (Rationale: this is likely to ; allow it to be code-generated as MVE VPNOT.) 
+define <2 x i1> @vpnot_2(<2 x i1> %vin) { +; CHECK-LABEL: @vpnot_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[VOUT:%.*]] = xor <2 x i1> [[VIN:%.*]], +; CHECK-NEXT: ret <2 x i1> [[VOUT]] +; +entry: + %int = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> %vin) + %flipped = xor i32 %int, 65535 + %vout = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %flipped) + ret <2 x i1> %vout +} + define <4 x i1> @vpnot_4(<4 x i1> %vin) { ; CHECK-LABEL: @vpnot_4( ; CHECK-NEXT: entry: @@ -284,6 +334,21 @@ ; And this still works even if the i32 is narrowed to i16 and back on ; opposite sides of the xor. +define <2 x i1> @vpnot_narrow_2(<2 x i1> %vin) { +; CHECK-LABEL: @vpnot_narrow_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[VOUT:%.*]] = xor <2 x i1> [[VIN:%.*]], +; CHECK-NEXT: ret <2 x i1> [[VOUT]] +; +entry: + %int = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> %vin) + %narrow = trunc i32 %int to i16 + %flipped_narrow = xor i16 %narrow, -1 + %flipped = zext i16 %flipped_narrow to i32 + %vout = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %flipped) + ret <2 x i1> %vout +} + define <4 x i1> @vpnot_narrow_4(<4 x i1> %vin) { ; CHECK-LABEL: @vpnot_narrow_4( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/ARM/mve-vctp.ll b/llvm/test/Transforms/InstSimplify/ConstProp/ARM/mve-vctp.ll --- a/llvm/test/Transforms/InstSimplify/ConstProp/ARM/mve-vctp.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/ARM/mve-vctp.ll @@ -209,59 +209,59 @@ -define <4 x i1> @vctp64_0() { +define <2 x i1> @vctp64_0() { ; CHECK-LABEL: @vctp64_0( ; CHECK-NEXT: entry: -; CHECK-NEXT: ret <4 x i1> zeroinitializer +; CHECK-NEXT: ret <2 x i1> zeroinitializer ; entry: - %int = call <4 x i1> @llvm.arm.mve.vctp64(i32 0) - ret <4 x i1> %int + %int = call <2 x i1> @llvm.arm.mve.vctp64(i32 0) + ret <2 x i1> %int } -define <4 x i1> @vctp64_1() { +define <2 x i1> @vctp64_1() { ; CHECK-LABEL: @vctp64_1( ; CHECK-NEXT: entry: -; CHECK-NEXT: ret <4 x i1> +; CHECK-NEXT: ret <2 x i1> ; entry: - %int = call 
<4 x i1> @llvm.arm.mve.vctp64(i32 1) - ret <4 x i1> %int + %int = call <2 x i1> @llvm.arm.mve.vctp64(i32 1) + ret <2 x i1> %int } -define <4 x i1> @vctp64_2() { +define <2 x i1> @vctp64_2() { ; CHECK-LABEL: @vctp64_2( ; CHECK-NEXT: entry: -; CHECK-NEXT: ret <4 x i1> +; CHECK-NEXT: ret <2 x i1> ; entry: - %int = call <4 x i1> @llvm.arm.mve.vctp64(i32 2) - ret <4 x i1> %int + %int = call <2 x i1> @llvm.arm.mve.vctp64(i32 2) + ret <2 x i1> %int } -define <4 x i1> @vctp64_100() { +define <2 x i1> @vctp64_100() { ; CHECK-LABEL: @vctp64_100( ; CHECK-NEXT: entry: -; CHECK-NEXT: ret <4 x i1> +; CHECK-NEXT: ret <2 x i1> ; entry: - %int = call <4 x i1> @llvm.arm.mve.vctp64(i32 100) - ret <4 x i1> %int + %int = call <2 x i1> @llvm.arm.mve.vctp64(i32 100) + ret <2 x i1> %int } -define <4 x i1> @vctp64_m1() { +define <2 x i1> @vctp64_m1() { ; CHECK-LABEL: @vctp64_m1( ; CHECK-NEXT: entry: -; CHECK-NEXT: ret <4 x i1> +; CHECK-NEXT: ret <2 x i1> ; entry: - %int = call <4 x i1> @llvm.arm.mve.vctp64(i32 -1) - ret <4 x i1> %int + %int = call <2 x i1> @llvm.arm.mve.vctp64(i32 -1) + ret <2 x i1> %int } -declare <4 x i1> @llvm.arm.mve.vctp64(i32) +declare <2 x i1> @llvm.arm.mve.vctp64(i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) declare <16 x i1> @llvm.arm.mve.vctp8(i32) diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains-inseltpoison.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains-inseltpoison.ll --- a/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains-inseltpoison.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains-inseltpoison.ll @@ -197,22 +197,24 @@ ; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]] ; CHECK-NEXT: br label [[TMP11:%.*]] ; CHECK: 11: -; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ] -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ] -; CHECK-NEXT: [[TMP14:%.*]] = 
phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ] -; CHECK-NEXT: [[TMP15:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp64(i32 [[TMP12]]) -; CHECK-NEXT: [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[TMP15]]) -; CHECK-NEXT: [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1 -; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0 -; CHECK-NEXT: [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[TMP15]], <4 x float> [[TMP13]]) -; CHECK-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4 -; CHECK-NEXT: [[TMP21]] = add i32 [[TMP12]], -4 -; CHECK-NEXT: br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]] -; CHECK: 22: -; CHECK-NEXT: [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]]) -; CHECK-NEXT: [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float -; CHECK-NEXT: [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]]) -; CHECK-NEXT: ret float [[TMP25]] +; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP23:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP21:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP15:%.*]] = call <2 x i1> @llvm.arm.mve.vctp64(i32 [[TMP12]]) +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> [[TMP15]]) +; CHECK-NEXT: [[TMP17:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP16]]) +; CHECK-NEXT: [[TMP18:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[TMP17]]) +; CHECK-NEXT: [[TMP19]] = extractvalue { <4 x float>, <4 x i32> } [[TMP18]], 1 +; 
CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP18]], 0 +; CHECK-NEXT: [[TMP21]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP20]], <4 x i1> [[TMP17]], <4 x float> [[TMP13]]) +; CHECK-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP23]] = add i32 [[TMP12]], -4 +; CHECK-NEXT: br i1 [[TMP22]], label [[TMP11]], label [[TMP24:%.*]] +; CHECK: 24: +; CHECK-NEXT: [[TMP25:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP21]]) +; CHECK-NEXT: [[TMP26:%.*]] = sitofp i32 [[TMP25]] to float +; CHECK-NEXT: [[TMP27:%.*]] = tail call float @llvm.fabs.f32(float [[TMP26]]) +; CHECK-NEXT: ret float [[TMP27]] ; %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8) %4 = extractvalue { <4 x i32>, i32 } %3, 0 diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains.ll --- a/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains.ll @@ -197,22 +197,24 @@ ; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]] ; CHECK-NEXT: br label [[TMP11:%.*]] ; CHECK: 11: -; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ] -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ] -; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ] -; CHECK-NEXT: [[TMP15:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp64(i32 [[TMP12]]) -; CHECK-NEXT: [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[TMP15]]) -; CHECK-NEXT: [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1 -; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 
0 -; CHECK-NEXT: [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[TMP15]], <4 x float> [[TMP13]]) -; CHECK-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4 -; CHECK-NEXT: [[TMP21]] = add i32 [[TMP12]], -4 -; CHECK-NEXT: br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]] -; CHECK: 22: -; CHECK-NEXT: [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]]) -; CHECK-NEXT: [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float -; CHECK-NEXT: [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]]) -; CHECK-NEXT: ret float [[TMP25]] +; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP23:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP21:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ] +; CHECK-NEXT: [[TMP15:%.*]] = call <2 x i1> @llvm.arm.mve.vctp64(i32 [[TMP12]]) +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> [[TMP15]]) +; CHECK-NEXT: [[TMP17:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP16]]) +; CHECK-NEXT: [[TMP18:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[TMP17]]) +; CHECK-NEXT: [[TMP19]] = extractvalue { <4 x float>, <4 x i32> } [[TMP18]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP18]], 0 +; CHECK-NEXT: [[TMP21]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP20]], <4 x i1> [[TMP17]], <4 x float> [[TMP13]]) +; CHECK-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP23]] = add i32 [[TMP12]], -4 +; CHECK-NEXT: br i1 [[TMP22]], label [[TMP11]], label [[TMP24:%.*]] +; CHECK: 24: +; CHECK-NEXT: [[TMP25:%.*]] = tail call i32 bitcast 
(i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP21]]) +; CHECK-NEXT: [[TMP26:%.*]] = sitofp i32 [[TMP25]] to float +; CHECK-NEXT: [[TMP27:%.*]] = tail call float @llvm.fabs.f32(float [[TMP26]]) +; CHECK-NEXT: ret float [[TMP27]] ; %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8) %4 = extractvalue { <4 x i32>, i32 } %3, 0