diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -1445,6 +1445,33 @@
                      "vrmlldavha">;
 }
 
+multiclass VADDV<bit acc, bit pred, string intname, Type Scalar> {
+  defvar accArg = !if(acc, (args Scalar:$acc), (args));
+  defvar predArg = !if(pred, (args Predicate:$pred), (args));
+  defvar intrinsic = !if(pred,
+                         IRInt<intname # "_predicated", [Vector, Predicate]>,
+                         IRInt<intname, [Vector]>);
+  defvar intCG = !con((intrinsic $v, (unsignedflag Scalar)),
+                      !if(pred, (? $pred), (?)));
+  defvar accCG = !if(acc, (add intCG, $acc), intCG);
+
+  def "": Intrinsic<Scalar, !con(accArg, (args Vector:$v), predArg), accCG>;
+}
+
+let params = T.Int in {
+defm vaddvq : VADDV<0, 0, "addv", Scalar32>;
+defm vaddvaq : VADDV<1, 0, "addv", Scalar32>;
+defm vaddvq_p : VADDV<0, 1, "addv", Scalar32>;
+defm vaddvaq_p : VADDV<1, 1, "addv", Scalar32>;
+}
+
+let params = [s32, u32] in {
+defm vaddlvq : VADDV<0, 0, "addlv", Scalar64>;
+defm vaddlvaq : VADDV<1, 0, "addlv", Scalar64>;
+defm vaddlvq_p : VADDV<0, 1, "addlv", Scalar64>;
+defm vaddlvaq_p : VADDV<1, 1, "addlv", Scalar64>;
+}
+
 let params = T.Int in {
 def vabavq : Intrinsic (unsignedflag Scalar), $a, $b, $c)>;
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vaddv.c b/clang/test/CodeGen/arm-mve-intrinsics/vaddv.c
new file
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vaddv.c
@@ -0,0 +1,470 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vaddvq_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> [[A:%.*]], i32 0)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vaddvq_s8(int8x16_t a) {
+#ifdef POLYMORPHIC
+  return vaddvq(a);
+#else /* POLYMORPHIC */
+  return vaddvq_s8(a);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> [[A:%.*]], i32 0)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vaddvq_s16(int16x8_t a) {
+#ifdef POLYMORPHIC
+  return vaddvq(a);
+#else /* POLYMORPHIC */
+  return vaddvq_s16(a);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> [[A:%.*]], i32 0)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_vaddvq_s32(int32x4_t a) {
+#ifdef POLYMORPHIC
+  return vaddvq(a);
+#else /* POLYMORPHIC */
+  return vaddvq_s32(a);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvq_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> [[A:%.*]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vaddvq_u8(uint8x16_t a) {
+#ifdef POLYMORPHIC
+  return vaddvq(a);
+#else /* POLYMORPHIC */
+  return vaddvq_u8(a);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddvq_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> [[A:%.*]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_vaddvq_u16(uint16x8_t a) {
+#ifdef POLYMORPHIC
+  return vaddvq(a);
+#else /* POLYMORPHIC */
+  return vaddvq_u16(a);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: 
@test_vaddvq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> [[A:%.*]], i32 1) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vaddvq_u32(uint32x4_t a) { +#ifdef POLYMORPHIC + return vaddvq(a); +#else /* POLYMORPHIC */ + return vaddvq_u32(a); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> [[B:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP1]] +// +int32_t test_vaddvaq_s8(int32_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vaddvaq(a, b); +#else /* POLYMORPHIC */ + return vaddvaq_s8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> [[B:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP1]] +// +int32_t test_vaddvaq_s16(int32_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vaddvaq(a, b); +#else /* POLYMORPHIC */ + return vaddvaq_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> [[B:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP1]] +// +int32_t test_vaddvaq_s32(int32_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vaddvaq(a, b); +#else /* POLYMORPHIC */ + return vaddvaq_s32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> [[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP1]] +// +uint32_t test_vaddvaq_u8(uint32_t a, uint8x16_t b) { +#ifdef POLYMORPHIC + return vaddvaq(a, b); +#else /* POLYMORPHIC */ + return vaddvaq_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> [[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP1]] +// +uint32_t test_vaddvaq_u16(uint32_t a, uint16x8_t b) { +#ifdef POLYMORPHIC + return vaddvaq(a, b); +#else /* POLYMORPHIC */ + return vaddvaq_u16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> [[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP1]] +// +uint32_t test_vaddvaq_u32(uint32_t a, uint32x4_t b) { +#ifdef POLYMORPHIC + return vaddvaq(a, b); +#else /* POLYMORPHIC */ + return vaddvaq_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 0, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vaddvq_p_s8(int8x16_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddvq_p_s8(a, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = 
zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vaddvq_p_s16(int16x8_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddvq_p_s16(a, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vaddvq_p_s32(int32x4_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddvq_p_s32(a, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_p_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 1, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vaddvq_p_u8(uint8x16_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddvq_p_u8(a, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vaddvq_p_u16(uint16x8_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddvq_p_u16(a, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vaddvq_p_u32(uint32x4_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddvq_p_u32(a, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP3]] +// +int32_t test_vaddvaq_p_s8(int32_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddvaq_p_s8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// 
CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP3]] +// +int32_t test_vaddvaq_p_s16(int32_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddvaq_p_s16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP3]] +// +int32_t test_vaddvaq_p_s32(int32_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddvaq_p_s32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_p_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP3]] +// +uint32_t test_vaddvaq_p_u8(uint32_t a, uint8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddvaq_p_u8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP3]] +// +uint32_t test_vaddvaq_p_u16(uint32_t a, uint16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddvaq_p_u16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP3]] +// +uint32_t test_vaddvaq_p_u32(uint32_t a, uint32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddvaq_p_u32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddlvq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> [[A:%.*]], i32 0) +// CHECK-NEXT: ret i64 [[TMP0]] +// +int64_t test_vaddlvq_s32(int32x4_t a) { +#ifdef POLYMORPHIC + return vaddlvq(a); +#else /* POLYMORPHIC */ + return vaddlvq_s32(a); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddlvq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> [[A:%.*]], i32 1) +// CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t 
test_vaddlvq_u32(uint32x4_t a) { +#ifdef POLYMORPHIC + return vaddlvq(a); +#else /* POLYMORPHIC */ + return vaddlvq_u32(a); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddlvaq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> [[B:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i64 [[TMP1]] +// +int64_t test_vaddlvaq_s32(int64_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vaddlvaq(a, b); +#else /* POLYMORPHIC */ + return vaddlvaq_s32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddlvaq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> [[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i64 [[TMP1]] +// +uint64_t test_vaddlvaq_u32(uint64_t a, uint32x4_t b) { +#ifdef POLYMORPHIC + return vaddlvaq(a, b); +#else /* POLYMORPHIC */ + return vaddlvaq_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddlvq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i64 [[TMP2]] +// +int64_t test_vaddlvq_p_s32(int32x4_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddlvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddlvq_p_s32(a, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddlvq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i64 [[TMP2]] +// +uint64_t test_vaddlvq_p_u32(uint32x4_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddlvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddlvq_p_u32(a, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddlvaq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i64 [[TMP3]] +// +int64_t test_vaddlvaq_p_s32(int64_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddlvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddlvaq_p_s32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddlvaq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i64 [[TMP3]] +// +uint64_t test_vaddlvaq_p_u32(uint64_t a, uint32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddlvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddlvaq_p_u32(a, b, p); +#endif /* POLYMORPHIC */ +} + diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td --- 
a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -911,6 +911,11 @@ defm int_arm_mve_min: MVE_minmaxv; defm int_arm_mve_max: MVE_minmaxv; +defm int_arm_mve_addv: MVEPredicated<[llvm_i32_ty], + [llvm_anyvector_ty, llvm_i32_ty /* unsigned */]>; +defm int_arm_mve_addlv: MVEPredicated<[llvm_i64_ty], + [llvm_anyvector_ty, llvm_i32_ty /* unsigned */]>; + // Intrinsic with a predicated and a non-predicated case. The predicated case // has two additional parameters: inactive (the value for inactive lanes, can // be undef) and predicate. diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -207,12 +207,16 @@ VMULLu, // ...unsigned // MVE reductions - VADDVs, - VADDVu, - VADDLVs, - VADDLVu, - VADDLVAs, - VADDLVAu, + VADDVs, // sign- or zero-extend the elements of a vector to i32, + VADDVu, // add them all together, and return an i32 of their sum + VADDLVs, // sign- or zero-extend elements to i64 and sum, returning + VADDLVu, // the low and high 32-bit halves of the sum + VADDLVAs, // same as VADDLV[su] but also add an input accumulator + VADDLVAu, // provided as low and high halves + VADDLVps, // same as VADDLVs but with a v4i1 predicate mask + VADDLVpu, // same as VADDLVu but with a v4i1 predicate mask + VADDLVAps, // same as VADDLVps but with a v4i1 predicate mask + VADDLVApu, // same as VADDLVpu but with a v4i1 predicate mask VMLAVs, VMLAVu, VMLALVs, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1669,6 +1669,10 @@ case ARMISD::VADDLVu: return "ARMISD::VADDLVu"; case ARMISD::VADDLVAs: return "ARMISD::VADDLVAs"; case ARMISD::VADDLVAu: return "ARMISD::VADDLVAu"; + case ARMISD::VADDLVps: return "ARMISD::VADDLVps"; + case ARMISD::VADDLVpu: return "ARMISD::VADDLVpu"; + case ARMISD::VADDLVAps: return "ARMISD::VADDLVAps"; + case ARMISD::VADDLVApu: return "ARMISD::VADDLVApu"; case ARMISD::VMLAVs: return "ARMISD::VMLAVs"; case ARMISD::VMLAVu: return "ARMISD::VMLAVu"; case ARMISD::VMLALVs: return "ARMISD::VMLALVs"; @@ -11816,18 +11820,15 @@ return SDValue(); SDLoc dl(N); - SDValue Lo = DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, - DCI.DAG.getConstant(0, dl, MVT::i32)); - SDValue Hi = DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, - DCI.DAG.getConstant(1, dl, MVT::i32)); - SDValue Red = - VecRed->getNumOperands() == 1 - ? 
DCI.DAG.getNode(OpcodeA, dl,
-                              DCI.DAG.getVTList({MVT::i32, MVT::i32}), Lo, Hi,
-                              VecRed->getOperand(0))
-            : DCI.DAG.getNode(OpcodeA, dl,
-                              DCI.DAG.getVTList({MVT::i32, MVT::i32}), Lo, Hi,
-                              VecRed->getOperand(0), VecRed->getOperand(1));
+    SmallVector<SDValue, 4> Ops;
+    Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
+                                  DCI.DAG.getConstant(0, dl, MVT::i32)));
+    Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
+                                  DCI.DAG.getConstant(1, dl, MVT::i32)));
+    for (unsigned i = 0, e = VecRed.getNumOperands(); i < e; i++)
+      Ops.push_back(VecRed->getOperand(i));
+    SDValue Red = DCI.DAG.getNode(OpcodeA, dl,
+                                  DCI.DAG.getVTList({MVT::i32, MVT::i32}), Ops);
     return DCI.DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
                            SDValue(Red.getNode(), 1));
   };
@@ -11840,6 +11841,14 @@
     return M;
   if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
     return M;
+  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
+    return M;
+  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
+    return M;
+  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
+    return M;
+  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
+    return M;
   if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
     return M;
   if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
@@ -14373,6 +14382,34 @@
       return SDValue();
     break;
   }
+
+  case Intrinsic::arm_mve_addv: {
+    // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
+    // which allows PerformADDVecReduce to turn it into VADDLV when possible.
+    bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+    unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
+    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
+  }
+
+  case Intrinsic::arm_mve_addlv:
+  case Intrinsic::arm_mve_addlv_predicated: {
+    // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
+    // which recombines the two outputs into an i64
+    bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+    unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
+                       (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
+                       (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
+
+    SmallVector<SDValue, 4> Ops;
+    for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
+      if (i != 2) // skip the unsigned flag
+        Ops.push_back(N->getOperand(i));
+
+    SDLoc dl(N);
+    SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
+    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
+                       val.getValue(1));
+  }
   }
 
   return SDValue();
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -607,60 +607,59 @@
   let Inst{0} = 0b0;
 }
 
-multiclass MVE_VADDV_A<string suffix, bit U, bits<2> size, list<dag> pattern=[]> {
-  def acc : MVE_VADDV<"vaddva", suffix,
+def ARMVADDVs : SDNode<"ARMISD::VADDVs", SDTVecReduce>;
+def ARMVADDVu : SDNode<"ARMISD::VADDVu", SDTVecReduce>;
+
+multiclass MVE_VADDV_A<MVEVectorVTInfo VTI> {
+  def acc : MVE_VADDV<"vaddva", VTI.Suffix,
                 (ins tGPREven:$Rda_src, MQPR:$Qm), "$Rda = $Rda_src",
-                0b1, U, size, pattern>;
-  def no_acc : MVE_VADDV<"vaddv", suffix,
+                0b1, VTI.Unsigned, VTI.Size>;
+  def no_acc : MVE_VADDV<"vaddv", VTI.Suffix,
                 (ins MQPR:$Qm), "",
-                0b0, U, size, pattern>;
-}
+                0b0, VTI.Unsigned, VTI.Size>;
 
-defm MVE_VADDVs8 : MVE_VADDV_A<"s8", 0b0, 0b00>;
-defm MVE_VADDVs16 : MVE_VADDV_A<"s16", 0b0, 0b01>;
-defm MVE_VADDVs32 : MVE_VADDV_A<"s32", 0b0, 0b10>;
-defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>;
-defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>;
-defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>;
+  defvar InstA = !cast<Instruction>(NAME # "acc");
+  defvar InstN = !cast<Instruction>(NAME # "no_acc");
 
-def ARMVADDVs : SDNode<"ARMISD::VADDVs", SDTVecReduce>;
-def ARMVADDVu : SDNode<"ARMISD::VADDVu", SDTVecReduce>;
+  let Predicates = [HasMVEInt] in {
+    if VTI.Unsigned then {
+      def : Pat<(i32 (vecreduce_add (VTI.Vec MQPR:$vec))),
+                (i32 (InstN $vec))>;
+      def : Pat<(i32 (ARMVADDVu (VTI.Vec MQPR:$vec))),
+                (i32 (InstN $vec))>;
+      def : Pat<(i32 (add (i32 (vecreduce_add (VTI.Vec MQPR:$vec))),
+                          (i32 tGPREven:$acc))),
+                (i32 (InstA $acc, $vec))>;
+      def : Pat<(i32 (add (i32 (ARMVADDVu (VTI.Vec MQPR:$vec))),
+                          (i32 tGPREven:$acc))),
+                (i32 (InstA $acc, $vec))>;
+    } else {
+      def : Pat<(i32 (ARMVADDVs (VTI.Vec MQPR:$vec))),
+                (i32 (InstN $vec))>;
+      def : Pat<(i32 (add (i32 (ARMVADDVs (VTI.Vec MQPR:$vec))),
+                          (i32 tGPREven:$acc))),
+                (i32 (InstA $acc, $vec))>;
+    }
 
-let Predicates = [HasMVEInt] in {
-  def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))),
-            (i32 (MVE_VADDVu32no_acc $src))>;
-  def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))),
-            (i32 (MVE_VADDVu16no_acc $src))>;
-  def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))),
-            (i32 (MVE_VADDVu8no_acc $src))>;
-
-  def : Pat<(i32 (ARMVADDVs (v8i16 MQPR:$src))),
-            (i32 (MVE_VADDVs16no_acc $src))>;
-  def : Pat<(i32 (ARMVADDVu (v8i16 MQPR:$src))),
-            (i32 (MVE_VADDVu16no_acc $src))>;
-  def : Pat<(i32 (ARMVADDVs (v16i8 MQPR:$src))),
-            (i32 (MVE_VADDVs8no_acc $src))>;
-  def : Pat<(i32 (ARMVADDVu (v16i8 MQPR:$src))),
-            (i32 (MVE_VADDVu8no_acc $src))>;
-
-  def : Pat<(i32 (add (i32 (vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPREven:$src2))),
-            (i32 (MVE_VADDVu32acc $src2, $src1))>;
-  def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPREven:$src2))),
-            (i32 (MVE_VADDVu16acc $src2, $src1))>;
-  def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPREven:$src2))),
-            (i32 (MVE_VADDVu8acc $src2, $src1))>;
-
-  def : Pat<(i32 (add (i32 (ARMVADDVs (v8i16 MQPR:$src1))), (i32 tGPREven:$src2))),
-            (i32 (MVE_VADDVs16acc $src2, $src1))>;
-  def : Pat<(i32 (add (i32 (ARMVADDVu (v8i16 MQPR:$src1))), (i32 tGPREven:$src2))),
-            (i32 (MVE_VADDVu16acc $src2, $src1))>;
-  def : Pat<(i32 (add (i32 (ARMVADDVs (v16i8 MQPR:$src1))), (i32 tGPREven:$src2))),
-            (i32 (MVE_VADDVs8acc $src2, $src1))>;
-  def : Pat<(i32 (add (i32 (ARMVADDVu (v16i8 MQPR:$src1))), (i32 tGPREven:$src2))),
-            (i32 (MVE_VADDVu8acc $src2, $src1))>;
+    def : Pat<(i32 (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec),
+                                                (i32 VTI.Unsigned),
+                                                (VTI.Pred VCCR:$pred))),
+              (i32 (InstN $vec, ARMVCCThen, $pred))>;
+    def : Pat<(i32 (add (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec),
+                                                     (i32 VTI.Unsigned),
+                                                     (VTI.Pred VCCR:$pred)),
+                        (i32 tGPREven:$acc))),
+              (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>;
+  }
 }
+
+defm MVE_VADDVs8 : MVE_VADDV_A<MVE_v16s8>;
+defm MVE_VADDVs16 : MVE_VADDV_A<MVE_v8s16>;
+defm MVE_VADDVs32 : MVE_VADDV_A<MVE_v4s32>;
+defm MVE_VADDVu8 : MVE_VADDV_A<MVE_v16u8>;
+defm MVE_VADDVu16 : MVE_VADDV_A<MVE_v8u16>;
+defm MVE_VADDVu32 : MVE_VADDV_A<MVE_v4u32>;
+
 class MVE_VADDLV<string iname, string suffix, dag iops, string cstr,
                  bit A, bit U, list<dag> pattern=[]>
   : MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname,
@@ -681,20 +680,6 @@
   let Inst{0} = 0b0;
 }
 
-multiclass MVE_VADDLV_A<string suffix, bit U, list<dag> pattern=[]> {
-  def acc : MVE_VADDLV<"vaddlva", suffix,
-                (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, MQPR:$Qm),
-                "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
-                0b1, U, pattern>;
-  def no_acc : MVE_VADDLV<"vaddlv", suffix,
-                (ins MQPR:$Qm), "",
-                0b0, U, pattern>;
-}
-
-
-defm MVE_VADDLVs32 : MVE_VADDLV_A<"s32", 0b0>;
-defm MVE_VADDLVu32 : MVE_VADDLV_A<"u32", 0b1>;
-
 def SDTVecReduceL : SDTypeProfile<2, 1, [ // VADDLV
   SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>
 ]>;
@@ -702,23 +687,49 @@
   SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>, SDTCisVec<4>
 ]>;
 
-def ARMVADDLVs : SDNode<"ARMISD::VADDLVs", SDTVecReduceL>;
-def ARMVADDLVu : SDNode<"ARMISD::VADDLVu", SDTVecReduceL>;
-def ARMVADDLVAs : SDNode<"ARMISD::VADDLVAs", SDTVecReduceLA>;
-def ARMVADDLVAu : SDNode<"ARMISD::VADDLVAu", SDTVecReduceLA>;
+def SDTVecReduceLP : SDTypeProfile<2, 2, [ // VADDLVp
+  SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>, SDTCisVec<2>
+]>;
+def SDTVecReduceLPA : SDTypeProfile<2, 4, [ // VADDLVAp
+  SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>,
+  SDTCisVec<4>, SDTCisVec<5>
+]>;
 
-let Predicates = [HasMVEInt] in {
-  def : Pat<(ARMVADDLVs (v4i32 MQPR:$val1)),
-            (MVE_VADDLVs32no_acc (v4i32 MQPR:$val1))>;
-  def : Pat<(ARMVADDLVu (v4i32 MQPR:$val1)),
-            (MVE_VADDLVu32no_acc (v4i32 MQPR:$val1))>;
+multiclass MVE_VADDLV_A<MVEVectorVTInfo VTI> {
+  def acc : MVE_VADDLV<"vaddlva", VTI.Suffix,
                 (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, MQPR:$Qm),
                 "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
+                0b1, VTI.Unsigned>;
+  def no_acc : MVE_VADDLV<"vaddlv", VTI.Suffix,
+                (ins MQPR:$Qm), "",
+                0b0, VTI.Unsigned>;
+
+  defvar InstA = !cast<Instruction>(NAME # "acc");
+  defvar InstN = !cast<Instruction>(NAME # "no_acc");
 
-  def : Pat<(ARMVADDLVAs tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1)),
-            (MVE_VADDLVs32acc tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1))>;
-  def : Pat<(ARMVADDLVAu tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1)),
-            (MVE_VADDLVu32acc tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1))>;
+  defvar letter = VTI.SuffixLetter;
+  defvar ARMVADDLV = SDNode<"ARMISD::VADDLV" # letter, SDTVecReduceL>;
+  defvar ARMVADDLVA = SDNode<"ARMISD::VADDLVA" # letter, SDTVecReduceLA>;
+  defvar ARMVADDLVp = SDNode<"ARMISD::VADDLVp" # letter, SDTVecReduceLP>;
+  defvar ARMVADDLVAp = SDNode<"ARMISD::VADDLVAp" # letter, SDTVecReduceLPA>;
+
+  let Predicates = [HasMVEInt] in {
+    def : Pat<(ARMVADDLV (v4i32 MQPR:$vec)),
+              (InstN (v4i32 MQPR:$vec))>;
+    def : Pat<(ARMVADDLVA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec)),
+              (InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32
MQPR:$vec))>; + def : Pat<(ARMVADDLVp (v4i32 MQPR:$vec), (VTI.Pred VCCR:$pred)), + (InstN (v4i32 MQPR:$vec), ARMVCCThen, (VTI.Pred VCCR:$pred))>; + def : Pat<(ARMVADDLVAp tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec), + (VTI.Pred VCCR:$pred)), + (InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec), + ARMVCCThen, (VTI.Pred VCCR:$pred))>; + } } +defm MVE_VADDLVs32 : MVE_VADDLV_A; +defm MVE_VADDLVu32 : MVE_VADDLV_A; + class MVE_VMINMAXNMV pattern=[]> : MVE_rDest<(outs rGPR:$RdaDest), (ins rGPR:$RdaSrc, MQPR:$Qm), diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddv.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddv.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddv.ll @@ -0,0 +1,416 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define arm_aapcs_vfpcc i32 @test_vaddvq_s8(<16 x i8> %a) { +; CHECK-LABEL: test_vaddvq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddv.s8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> %a, i32 0) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_s16(<8 x i16> %a) { +; CHECK-LABEL: test_vaddvq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddv.s16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> %a, i32 0) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_s32(<4 x i32> %a) { +; CHECK-LABEL: test_vaddvq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddv.s32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> %a, i32 0) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_u8(<16 x i8> %a) { +; CHECK-LABEL: test_vaddvq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddv.u8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> %a, i32 1) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_u16(<8 x i16> %a) { +; CHECK-LABEL: test_vaddvq_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddv.u16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> %a, i32 1) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_u32(<4 x i32> %a) { +; CHECK-LABEL: test_vaddvq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddv.u32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> %a, i32 1) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_s8(i32 %a, <16 x i8> %b) { +; CHECK-LABEL: test_vaddvaq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddva.s8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> %b, i32 0) + %1 = add i32 %0, %a + ret i32 %1 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_s16(i32 %a, <8 x i16> %b) { +; CHECK-LABEL: test_vaddvaq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddva.s16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> %b, i32 0) + %1 = add i32 %0, %a + ret i32 %1 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_s32(i32 %a, <4 x i32> %b) { +; CHECK-LABEL: test_vaddvaq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddva.s32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> %b, i32 0) + %1 = add i32 %0, %a + ret i32 %1 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_u8(i32 %a, <16 x i8> %b) { +; CHECK-LABEL: test_vaddvaq_u8: +; CHECK: @ %bb.0: @ 
%entry +; CHECK-NEXT: vaddva.u8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> %b, i32 1) + %1 = add i32 %0, %a + ret i32 %1 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_u16(i32 %a, <8 x i16> %b) { +; CHECK-LABEL: test_vaddvaq_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddva.u16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> %b, i32 1) + %1 = add i32 %0, %a + ret i32 %1 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_u32(i32 %a, <4 x i32> %b) { +; CHECK-LABEL: test_vaddvaq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddva.u32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> %b, i32 1) + %1 = add i32 %0, %a + ret i32 %1 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_p_s8(<16 x i8> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvt.s8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> %a, i32 0, <16 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_p_s16(<8 x i16> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvt.s16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> %a, i32 0, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_p_s32(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvt.s32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %a, i32 0, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_p_u8(<16 x i8> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvq_p_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvt.u8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> %a, i32 1, <16 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_p_u16(<8 x i16> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvq_p_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvt.u16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> %a, i32 1, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_p_u32(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvt.u32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %a, i32 1, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_p_s8(i32 %a, 
<16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvaq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvat.s8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> %b, i32 0, <16 x i1> %1) + %3 = add i32 %2, %a + ret i32 %3 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_p_s16(i32 %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvaq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvat.s16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> %b, i32 0, <8 x i1> %1) + %3 = add i32 %2, %a + ret i32 %3 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_p_s32(i32 %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvaq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvat.s32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %b, i32 0, <4 x i1> %1) + %3 = add i32 %2, %a + ret i32 %3 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_p_u8(i32 %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvaq_p_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvat.u8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> %b, i32 1, <16 x i1> %1) + %3 = add i32 %2, %a + ret i32 %3 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_p_u16(i32 %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvaq_p_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvat.u16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> %b, i32 1, <8 x i1> %1) + %3 = add i32 %2, %a + ret i32 %3 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_p_u32(i32 %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvaq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvat.u32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %b, i32 1, <4 x i1> %1) + %3 = add i32 %2, %a + ret i32 %3 +} + +define arm_aapcs_vfpcc i64 @test_vaddlvq_s32(<4 x i32> %a) { +; CHECK-LABEL: test_vaddlvq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddlv.s32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> %a, i32 0) + ret i64 %0 +} + +define arm_aapcs_vfpcc i64 @test_vaddlvq_u32(<4 x i32> %a) { +; CHECK-LABEL: test_vaddlvq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddlv.u32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> %a, i32 1) + ret i64 %0 +} + +define arm_aapcs_vfpcc i64 @test_vaddlvaq_s32(i64 %a, <4 x i32> %b) { +; CHECK-LABEL: test_vaddlvaq_s32: +; CHECK: @ %bb.0: @ %entry +; 
CHECK-NEXT: vaddlva.s32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> %b, i32 0) + %1 = add i64 %0, %a + ret i64 %1 +} + +define arm_aapcs_vfpcc i64 @test_vaddlvaq_u32(i64 %a, <4 x i32> %b) { +; CHECK-LABEL: test_vaddlvaq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddlva.u32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> %b, i32 1) + %1 = add i64 %0, %a + ret i64 %1 +} + +define arm_aapcs_vfpcc i64 @test_vaddlvq_p_s32(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddlvq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddlvt.s32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> %a, i32 0, <4 x i1> %1) + ret i64 %2 +} + +define arm_aapcs_vfpcc i64 @test_vaddlvq_p_u32(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddlvq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddlvt.u32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> %a, i32 1, <4 x i1> %1) + ret i64 %2 +} + +define arm_aapcs_vfpcc i64 @test_vaddlvaq_p_s32(i64 %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddlvaq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddlvat.s32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> %b, i32 0, <4 x i1> %1) + %3 = add i64 %2, %a + ret i64 %3 +} + +define arm_aapcs_vfpcc i64 @test_vaddlvaq_p_u32(i64 %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddlvaq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddlvat.u32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> %b, i32 1, <4 x i1> %1) + %3 = add i64 %2, %a + ret i64 %3 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) + +declare i32 @llvm.arm.mve.addv.v16i8(<16 x i8>, i32) +declare i32 @llvm.arm.mve.addv.v8i16(<8 x i16>, i32) +declare i32 @llvm.arm.mve.addv.v4i32(<4 x i32>, i32) +declare i64 @llvm.arm.mve.addlv.v4i32(<4 x i32>, i32) + +declare i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8>, i32, <16 x i1>) +declare i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16>, i32, <8 x i1>) +declare i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) +declare i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)