diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -220,6 +220,29 @@ $m1, $m2, $addend_s, $pred)>; } +multiclass VQDMLA { + def hq_n: Intrinsic< + Vector, (args Vector:$addend, Vector:$m1, Scalar:$m2_s), + (IRInt $addend, $m1, $m2_s)>; + def shq_n: Intrinsic< + Vector, (args Vector:$m1, Vector:$m2, Scalar:$addend_s), + (IRInt $m1, $m2, $addend_s)>; + + def hq_m_n: Intrinsic< + Vector, (args Vector:$addend, Vector:$m1, Scalar:$m2_s, Predicate:$pred), + (IRInt + $addend, $m1, $m2_s, $pred)>; + def shq_m_n: Intrinsic< + Vector, (args Vector:$m1, Vector:$m2, Scalar:$addend_s, Predicate:$pred), + (IRInt + $m1, $m2, $addend_s, $pred)>; +} + +let params = T.Signed, pnt = PNT_NType in { + defm vqdmla: VQDMLA; + defm vqrdmla: VQDMLA; +} + let params = !listconcat(T.Int16, T.Int32) in { let pnt = PNT_None in { def vmvnq_n: Intrinsic @llvm.arm.mve.vqdmlah.v16i8(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]]) +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +int8x16_t test_vqdmlahq_n_s8(int8x16_t a, int8x16_t b, int8_t c) { +#ifdef POLYMORPHIC + return vqdmlahq(a, b, c); +#else /* POLYMORPHIC */ + return vqdmlahq_n_s8(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlahq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlah.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +int16x8_t test_vqdmlahq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { +#ifdef POLYMORPHIC + return vqdmlahq(a, b, c); +#else /* POLYMORPHIC */ + return vqdmlahq_n_s16(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlahq_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlah.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]]) +// 
CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqdmlahq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { +#ifdef POLYMORPHIC + return vqdmlahq(a, b, c); +#else /* POLYMORPHIC */ + return vqdmlahq_n_s32(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlahq_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.mve.vqrdmlah.v16i8(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]]) +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +int8x16_t test_vqrdmlahq_n_s8(int8x16_t a, int8x16_t b, int8_t c) { +#ifdef POLYMORPHIC + return vqrdmlahq(a, b, c); +#else /* POLYMORPHIC */ + return vqrdmlahq_n_s8(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlahq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vqrdmlah.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +int16x8_t test_vqrdmlahq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { +#ifdef POLYMORPHIC + return vqrdmlahq(a, b, c); +#else /* POLYMORPHIC */ + return vqrdmlahq_n_s16(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlahq_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqrdmlah.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqrdmlahq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { +#ifdef POLYMORPHIC + return vqrdmlahq(a, b, c); +#else /* POLYMORPHIC */ + return vqrdmlahq_n_s32(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlashq_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.mve.vqrdmlash.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]]) +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// 
+int8x16_t test_vqrdmlashq_n_s8(int8x16_t a, int8x16_t b, int8_t c) { +#ifdef POLYMORPHIC + return vqrdmlashq(a, b, c); +#else /* POLYMORPHIC */ + return vqrdmlashq_n_s8(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlashq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vqrdmlash.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +int16x8_t test_vqrdmlashq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { +#ifdef POLYMORPHIC + return vqrdmlashq(a, b, c); +#else /* POLYMORPHIC */ + return vqrdmlashq_n_s16(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlashq_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqrdmlash.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqrdmlashq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { +#ifdef POLYMORPHIC + return vqrdmlashq(a, b, c); +#else /* POLYMORPHIC */ + return vqrdmlashq_n_s32(a, b, c); +#endif /* POLYMORPHIC */ +} + // CHECK-LABEL: @test_vfmaq_m_f16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 @@ -639,3 +762,145 @@ return vmlasq_m_n_u32(a, b, c, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vqdmlahq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlah.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]]) +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +int8x16_t test_vqdmlahq_m_n_s8(int8x16_t a, int8x16_t b, int8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmlahq_m(a, b, c, p); +#else /* 
POLYMORPHIC */ + return vqdmlahq_m_n_s8(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlahq_m_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlah.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]]) +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +int16x8_t test_vqdmlahq_m_n_s16(int16x8_t a, int16x8_t b, int16_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmlahq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vqdmlahq_m_n_s16(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlahq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlah.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqdmlahq_m_n_s32(int32x4_t a, int32x4_t b, int32_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmlahq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vqdmlahq_m_n_s32(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlahq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vqrdmlah.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]]) +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +int8x16_t test_vqrdmlahq_m_n_s8(int8x16_t a, int8x16_t b, int8_t c, 
mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmlahq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vqrdmlahq_m_n_s8(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlahq_m_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vqrdmlah.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]]) +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +int16x8_t test_vqrdmlahq_m_n_s16(int16x8_t a, int16x8_t b, int16_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmlahq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vqrdmlahq_m_n_s16(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlahq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqrdmlah.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqrdmlahq_m_n_s32(int32x4_t a, int32x4_t b, int32_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmlahq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vqrdmlahq_m_n_s32(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlashq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vqrdmlash.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]]) +// CHECK-NEXT: ret <16 
x i8> [[TMP3]] +// +int8x16_t test_vqrdmlashq_m_n_s8(int8x16_t a, int8x16_t b, int8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmlashq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vqrdmlashq_m_n_s8(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlashq_m_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vqrdmlash.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]]) +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +int16x8_t test_vqrdmlashq_m_n_s16(int16x8_t a, int16x8_t b, int16_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmlashq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vqrdmlashq_m_n_s16(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlashq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqrdmlash.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqrdmlashq_m_n_s32(int32x4_t a, int32x4_t b, int32_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmlashq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vqrdmlashq_m_n_s32(a, b, c, p); +#endif /* POLYMORPHIC */ +} + diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -1255,6 +1255,19 @@ llvm_i32_ty /* addend (scalar) */, llvm_anyvector_ty /* pred */], [IntrNoMem]>; +defm int_arm_mve_vqdmlah: MVEPredicated<[llvm_anyvector_ty], + 
[LLVMMatchType<0> /* addend */, LLVMMatchType<0> /* mult op #1 */, + llvm_i32_ty /* mult op #2 (scalar) */]>; +defm int_arm_mve_vqrdmlah: MVEPredicated<[llvm_anyvector_ty], + [LLVMMatchType<0> /* addend */, LLVMMatchType<0> /* mult op #1 */, + llvm_i32_ty /* mult op #2 (scalar) */]>; +defm int_arm_mve_vqdmlash: MVEPredicated<[llvm_anyvector_ty], + [LLVMMatchType<0> /* mult op #1 */, LLVMMatchType<0> /* mult op #2 */, + llvm_i32_ty /* addend (scalar) */]>; +defm int_arm_mve_vqrdmlash: MVEPredicated<[llvm_anyvector_ty], + [LLVMMatchType<0> /* mult op #1 */, LLVMMatchType<0> /* mult op #2 */, + llvm_i32_ty /* addend (scalar) */]>; + // CDE (Custom Datapath Extension) def int_arm_cde_cx1: Intrinsic< diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -14291,8 +14291,16 @@ // No immediate versions of these to check for. break; + case Intrinsic::arm_mve_vqdmlah: + case Intrinsic::arm_mve_vqdmlash: + case Intrinsic::arm_mve_vqrdmlah: + case Intrinsic::arm_mve_vqrdmlash: case Intrinsic::arm_mve_vmla_n_predicated: - case Intrinsic::arm_mve_vmlas_n_predicated: { + case Intrinsic::arm_mve_vmlas_n_predicated: + case Intrinsic::arm_mve_vqdmlah_predicated: + case Intrinsic::arm_mve_vqdmlash_predicated: + case Intrinsic::arm_mve_vqrdmlah_predicated: + case Intrinsic::arm_mve_vqrdmlash_predicated: { // These intrinsics all take an i32 scalar operand which is narrowed to the // size of a single lane of the vector type they return. 
So we don't need // any bits of that operand above that point, which allows us to eliminate diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -5240,10 +5240,30 @@ let Inst{5} = bit_5; } +multiclass MVE_VQDMLAH_qr_multi { + def "": MVE_VQDMLAH_qr; + defvar Inst = !cast(NAME); + defvar unpred_int = !cast("int_arm_mve_" # iname); + defvar pred_int = !cast("int_arm_mve_" # iname # "_predicated"); + + let Predicates = [HasMVEInt] in { + def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2), + (i32 rGPR:$s))), + (VTI.Vec (Inst (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2), + (i32 rGPR:$s)))>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2), + (i32 rGPR:$s), (VTI.Pred VCCR:$pred))), + (VTI.Vec (Inst (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2), + (i32 rGPR:$s), ARMVCCThen, + (VTI.Pred VCCR:$pred)))>; + } +} + multiclass MVE_VQDMLAH_qr_types { - def s8 : MVE_VQDMLAH_qr; - def s16 : MVE_VQDMLAH_qr; - def s32 : MVE_VQDMLAH_qr; + defm s8 : MVE_VQDMLAH_qr_multi; + defm s16 : MVE_VQDMLAH_qr_multi; + defm s32 : MVE_VQDMLAH_qr_multi; } defm MVE_VQDMLAH_qr : MVE_VQDMLAH_qr_types<"vqdmlah", 0b1, 0b0>; diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll @@ -263,6 +263,102 @@ ret <4 x i32> %1 } +define arm_aapcs_vfpcc <16 x i8> @test_vqdmlahq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) { +; CHECK-LABEL: test_vqdmlahq_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmlah.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %c to i32 + %1 = tail call <16 x i8> @llvm.arm.mve.vqdmlah.v16i8(<16 x i8> %a, <16 x i8> %b, i32 %0) + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmlahq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) { +; 
CHECK-LABEL: test_vqdmlahq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmlah.s16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %c to i32 + %1 = tail call <8 x i16> @llvm.arm.mve.vqdmlah.v8i16(<8 x i16> %a, <8 x i16> %b, i32 %0) + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmlahq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) { +; CHECK-LABEL: test_vqdmlahq_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmlah.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vqdmlah.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %c) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlahq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) { +; CHECK-LABEL: test_vqrdmlahq_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlah.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %c to i32 + %1 = tail call <16 x i8> @llvm.arm.mve.vqrdmlah.v16i8(<16 x i8> %a, <16 x i8> %b, i32 %0) + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlahq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) { +; CHECK-LABEL: test_vqrdmlahq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlah.s16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %c to i32 + %1 = tail call <8 x i16> @llvm.arm.mve.vqrdmlah.v8i16(<8 x i16> %a, <8 x i16> %b, i32 %0) + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlahq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) { +; CHECK-LABEL: test_vqrdmlahq_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlah.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vqrdmlah.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %c) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlashq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) { +; CHECK-LABEL: test_vqrdmlashq_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlash.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %c to i32 + %1 = tail call <16 x i8> 
@llvm.arm.mve.vqrdmlash.v16i8(<16 x i8> %a, <16 x i8> %b, i32 %0) + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlashq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) { +; CHECK-LABEL: test_vqrdmlashq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlash.s16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %c to i32 + %1 = tail call <8 x i16> @llvm.arm.mve.vqrdmlash.v8i16(<8 x i16> %a, <8 x i16> %b, i32 %0) + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlashq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) { +; CHECK-LABEL: test_vqrdmlashq_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlash.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vqrdmlash.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %c) + ret <4 x i32> %0 +} + define arm_aapcs_vfpcc <8 x half> @test_vfmaq_m_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i16 zeroext %p) { ; CHECK-LABEL: test_vfmaq_m_f16: ; CHECK: @ %bb.0: @ %entry @@ -571,6 +667,138 @@ ret <4 x i32> %2 } +define arm_aapcs_vfpcc <16 x i8> @test_vqdmlahq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmlahq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmlaht.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %c to i32 + %1 = zext i16 %p to i32 + %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = tail call <16 x i8> @llvm.arm.mve.vqdmlah.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2) + ret <16 x i8> %3 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmlahq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmlahq_m_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmlaht.s16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %c to i32 + %1 = zext i16 %p to i32 + %2 = tail call <8 x i1> 
@llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = tail call <8 x i16> @llvm.arm.mve.vqdmlah.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2) + ret <8 x i16> %3 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmlahq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmlahq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmlaht.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vqdmlah.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlahq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmlahq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmlaht.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %c to i32 + %1 = zext i16 %p to i32 + %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = tail call <16 x i8> @llvm.arm.mve.vqrdmlah.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2) + ret <16 x i8> %3 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlahq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmlahq_m_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmlaht.s16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %c to i32 + %1 = zext i16 %p to i32 + %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = tail call <8 x i16> @llvm.arm.mve.vqrdmlah.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2) + ret <8 x i16> %3 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlahq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmlahq_m_n_s32: +; CHECK: @ 
%bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmlaht.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vqrdmlah.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlashq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmlashq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmlasht.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %c to i32 + %1 = zext i16 %p to i32 + %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = tail call <16 x i8> @llvm.arm.mve.vqrdmlash.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2) + ret <16 x i8> %3 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlashq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmlashq_m_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmlasht.s16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %c to i32 + %1 = zext i16 %p to i32 + %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = tail call <8 x i16> @llvm.arm.mve.vqrdmlash.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2) + ret <8 x i16> %3 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlashq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmlashq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmlasht.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vqrdmlash.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x 
i1> %1) + ret <4 x i32> %2 +} + declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) @@ -585,3 +813,21 @@ declare <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>) declare <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>) declare <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>) +declare <16 x i8> @llvm.arm.mve.vqdmlah.v16i8(<16 x i8>, <16 x i8>, i32) +declare <8 x i16> @llvm.arm.mve.vqdmlah.v8i16(<8 x i16>, <8 x i16>, i32) +declare <4 x i32> @llvm.arm.mve.vqdmlah.v4i32(<4 x i32>, <4 x i32>, i32) +declare <16 x i8> @llvm.arm.mve.vqrdmlah.v16i8(<16 x i8>, <16 x i8>, i32) +declare <8 x i16> @llvm.arm.mve.vqrdmlah.v8i16(<8 x i16>, <8 x i16>, i32) +declare <4 x i32> @llvm.arm.mve.vqrdmlah.v4i32(<4 x i32>, <4 x i32>, i32) +declare <16 x i8> @llvm.arm.mve.vqrdmlash.v16i8(<16 x i8>, <16 x i8>, i32) +declare <8 x i16> @llvm.arm.mve.vqrdmlash.v8i16(<8 x i16>, <8 x i16>, i32) +declare <4 x i32> @llvm.arm.mve.vqrdmlash.v4i32(<4 x i32>, <4 x i32>, i32) +declare <16 x i8> @llvm.arm.mve.vqdmlah.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>) +declare <8 x i16> @llvm.arm.mve.vqdmlah.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>) +declare <4 x i32> @llvm.arm.mve.vqdmlah.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>) +declare <16 x i8> @llvm.arm.mve.vqrdmlah.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>) +declare <8 x i16> @llvm.arm.mve.vqrdmlah.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>) +declare <4 x i32> @llvm.arm.mve.vqrdmlah.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>) +declare <16 x i8> @llvm.arm.mve.vqrdmlash.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>) +declare <8 x i16> @llvm.arm.mve.vqrdmlash.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>) 
+declare <4 x i32> @llvm.arm.mve.vqrdmlash.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)