diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -202,6 +202,24 @@ defm vfms: FMA<0>; } +let params = T.Int, pnt = PNT_NType in { + def vmlaq_n: Intrinsic< + Vector, (args Vector:$addend, Vector:$m1, unpromoted:$m2_s), + (add (mul $m1, (splat $m2_s)), $addend)>; + def vmlasq_n: Intrinsic< + Vector, (args Vector:$m1, Vector:$m2, unpromoted:$addend_s), + (add (mul $m1, $m2), (splat $addend_s))>; + + def vmlaq_m_n: Intrinsic< + Vector, (args Vector:$addend, Vector:$m1, Scalar:$m2_s, Predicate:$pred), + (IRInt<"vmla_n_predicated", [Vector, Predicate]> + $addend, $m1, $m2_s, $pred)>; + def vmlasq_m_n: Intrinsic< + Vector, (args Vector:$m1, Vector:$m2, Scalar:$addend_s, Predicate:$pred), + (IRInt<"vmlas_n_predicated", [Vector, Predicate]> + $m1, $m2, $addend_s, $pred)>; +} + let params = !listconcat(T.Int16, T.Int32) in { let pnt = PNT_None in { def vmvnq_n: Intrinsic undef, i8 [[C:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = mul <16 x i8> [[B:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +int8x16_t test_vmlaq_n_s8(int8x16_t a, int8x16_t b, int8_t c) { +#ifdef POLYMORPHIC + return vmlaq(a, b, c); +#else /* POLYMORPHIC */ + return vmlaq_n_s8(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlaq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = mul <8 x i16> [[B:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { +#ifdef POLYMORPHIC + return vmlaq(a, b, c); +#else /* POLYMORPHIC */ + return vmlaq_n_s16(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlaq_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i32> [[B:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { +#ifdef POLYMORPHIC + return vmlaq(a, b, c); +#else /* POLYMORPHIC */ + return vmlaq_n_s32(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlaq_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[C:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = mul <16 x i8> [[B:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +uint8x16_t test_vmlaq_n_u8(uint8x16_t a, uint8x16_t b, uint8_t c) { +#ifdef POLYMORPHIC + return vmlaq(a, b, c); +#else /* POLYMORPHIC */ + return vmlaq_n_u8(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlaq_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = mul <8 x i16> [[B:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { +#ifdef POLYMORPHIC + return vmlaq(a, b, c); +#else /* POLYMORPHIC */ + return vmlaq_n_u16(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlaq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i32> [[B:%.*]], [[DOTSPLAT]] +// CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { +#ifdef POLYMORPHIC + return vmlaq(a, b, c); +#else /* POLYMORPHIC */ + return vmlaq_n_u32(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlasq_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = mul <16 x i8> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[C:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[TMP0]], [[DOTSPLAT]] +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +int8x16_t test_vmlasq_n_s8(int8x16_t a, int8x16_t b, int8_t c) { +#ifdef POLYMORPHIC + return vmlasq(a, b, c); +#else /* POLYMORPHIC */ + return vmlasq_n_s8(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlasq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = mul <8 x i16> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[TMP0]], [[DOTSPLAT]] +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +int16x8_t test_vmlasq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { +#ifdef POLYMORPHIC + return vmlasq(a, b, c); +#else /* POLYMORPHIC */ + return vmlasq_n_s16(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlasq_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[TMP0]], [[DOTSPLAT]] +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +int32x4_t test_vmlasq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { +#ifdef POLYMORPHIC + return vmlasq(a, b, c); +#else /* POLYMORPHIC */ + return vmlasq_n_s32(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlasq_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = mul <16 x i8> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[C:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[TMP0]], [[DOTSPLAT]] +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +uint8x16_t test_vmlasq_n_u8(uint8x16_t a, uint8x16_t b, uint8_t c) { +#ifdef POLYMORPHIC + return vmlasq(a, b, c); +#else /* POLYMORPHIC */ + return vmlasq_n_u8(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlasq_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = mul <8 x i16> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[TMP0]], [[DOTSPLAT]] +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +uint16x8_t test_vmlasq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { +#ifdef POLYMORPHIC + return vmlasq(a, b, c); +#else /* POLYMORPHIC */ + return vmlasq_n_u16(a, b, c); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlasq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[TMP0]], [[DOTSPLAT]] +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +uint32x4_t test_vmlasq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { +#ifdef POLYMORPHIC + return vmlasq(a, b, c); +#else /* POLYMORPHIC */ + return vmlasq_n_u32(a, b, c); +#endif /* POLYMORPHIC */ +} + // CHECK-LABEL: @test_vfmaq_m_f16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 @@ -259,3 +451,191 @@ return vfmsq_m_f32(a, b, c, p); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @test_vmlaq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]]) +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +int8x16_t test_vmlaq_m_n_s8(int8x16_t a, int8x16_t b, int8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vmlaq_m_n_s8(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlaq_m_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]]) +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +int16x8_t test_vmlaq_m_n_s16(int16x8_t a, int16x8_t b, int16_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vmlaq_m_n_s16(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlaq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vmlaq_m_n_s32(int32x4_t a, int32x4_t b, int32_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vmlaq_m_n_s32(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlaq_m_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]]) +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t test_vmlaq_m_n_u8(uint8x16_t a, uint8x16_t b, uint8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vmlaq_m_n_u8(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlaq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]]) +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_vmlaq_m_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vmlaq_m_n_u16(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlaq_m_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vmlaq_m_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlaq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vmlaq_m_n_u32(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlasq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]]) +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +int8x16_t test_vmlasq_m_n_s8(int8x16_t a, int8x16_t b, int8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlasq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vmlasq_m_n_s8(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlasq_m_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]]) +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +int16x8_t test_vmlasq_m_n_s16(int16x8_t a, int16x8_t b, int16_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlasq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vmlasq_m_n_s16(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlasq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vmlasq_m_n_s32(int32x4_t a, int32x4_t b, int32_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlasq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vmlasq_m_n_s32(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlasq_m_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]]) +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t test_vmlasq_m_n_u8(uint8x16_t a, uint8x16_t b, uint8_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlasq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vmlasq_m_n_u8(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlasq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]]) +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_vmlasq_m_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlasq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vmlasq_m_n_u16(a, b, c, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmlasq_m_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vmlasq_m_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmlasq_m(a, b, c, p); +#else /* POLYMORPHIC */ + return vmlasq_m_n_u32(a, b, c, p); +#endif /* POLYMORPHIC */ +} diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -1246,6 +1246,14 @@ def int_arm_mve_fma_predicated: Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0> /* mult op #1 */, LLVMMatchType<0> /* mult op #2 */, LLVMMatchType<0> /* addend */, llvm_anyvector_ty /* pred */], [IntrNoMem]>; +def int_arm_mve_vmla_n_predicated: Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0> /* mult op #1 */, LLVMMatchType<0> /* addend */, + llvm_i32_ty /* mult op #2 (scalar) */, llvm_anyvector_ty /* pred */], + [IntrNoMem]>; +def int_arm_mve_vmlas_n_predicated: Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0> /* mult op #1 */, LLVMMatchType<0> /* mult op #2 */, + llvm_i32_ty /* addend (scalar) */, llvm_anyvector_ty /* pred */], + [IntrNoMem]>; // CDE (Custom Datapath Extension) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -352,6 +352,7 @@ SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const; + SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; bool SimplifyDemandedBitsForTargetNode(SDValue Op, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -911,7 +911,6 @@ setOperationAction(ISD::FMA, MVT::v4f32, Expand); } - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SRA); @@ -939,6 +938,7 @@ setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::VECREDUCE_ADD); @@ -14173,7 +14173,9 @@ } /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. -static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { +SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); switch (IntNo) { default: @@ -14322,6 +14324,19 @@ case Intrinsic::arm_neon_vqrshiftu: // No immediate versions of these to check for. break; + + case Intrinsic::arm_mve_vmla_n_predicated: + case Intrinsic::arm_mve_vmlas_n_predicated: { + // These intrinsics all take an i32 scalar operand which is narrowed to the + // size of a single lane of the vector type they return. So we don't need + // any bits of that operand above that point, which allows us to eliminate + // uxth/sxth. + unsigned BitWidth = N->getValueType(0).getScalarSizeInBits(); + APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); + if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI)) + return SDValue(); + break; + } } return SDValue(); @@ -15041,7 +15056,8 @@ return PerformVCVTCombine(N, DCI.DAG, Subtarget); case ISD::FDIV: return PerformVDIVCombine(N, DCI.DAG, Subtarget); - case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); + case ISD::INTRINSIC_WO_CHAIN: + return PerformIntrinsicCombine(N, DCI); case ISD::SHL: case ISD::SRA: case ISD::SRL: diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -5152,48 +5152,49 @@ let validForTailPredication = 1; } -def MVE_VMLA_qr_s8 : MVE_VFMAMLA_qr<"vmla", "s8", 0b0, 0b00, 0b0>; -def MVE_VMLA_qr_s16 : MVE_VFMAMLA_qr<"vmla", "s16", 0b0, 0b01, 0b0>; -def MVE_VMLA_qr_s32 : MVE_VFMAMLA_qr<"vmla", "s32", 0b0, 0b10, 0b0>; -def MVE_VMLA_qr_u8 : MVE_VFMAMLA_qr<"vmla", "u8", 0b1, 0b00, 0b0>; -def MVE_VMLA_qr_u16 : MVE_VFMAMLA_qr<"vmla", "u16", 0b1, 0b01, 0b0>; -def MVE_VMLA_qr_u32 : MVE_VFMAMLA_qr<"vmla", "u32", 0b1, 0b10, 0b0>; - -def MVE_VMLAS_qr_s8 : MVE_VFMAMLA_qr<"vmlas", "s8", 0b0, 0b00, 0b1>; -def MVE_VMLAS_qr_s16 : MVE_VFMAMLA_qr<"vmlas", "s16", 0b0, 0b01, 0b1>; -def MVE_VMLAS_qr_s32 : MVE_VFMAMLA_qr<"vmlas", "s32", 0b0, 0b10, 0b1>; -def MVE_VMLAS_qr_u8 : MVE_VFMAMLA_qr<"vmlas", "u8", 0b1, 0b00, 0b1>; -def MVE_VMLAS_qr_u16 : MVE_VFMAMLA_qr<"vmlas", "u16", 0b1, 0b01, 0b1>; -def MVE_VMLAS_qr_u32 : MVE_VFMAMLA_qr<"vmlas", "u32", 0b1, 0b10, 0b1>; +multiclass MVE_VMLA_qr_multi { + def "": MVE_VFMAMLA_qr; + defvar Inst = !cast(NAME); + defvar pred_int = !cast("int_arm_mve_" # iname # "_n_predicated"); + defvar v1 = (VTI.Vec MQPR:$v1); + defvar v2 = (VTI.Vec MQPR:$v2); + defvar vs = (VTI.Vec (ARMvdup rGPR:$s)); + defvar s = (i32 rGPR:$s); + defvar pred = (VTI.Pred VCCR:$pred); -let Predicates = [HasMVEInt] in { - def : Pat<(v4i32 (add (v4i32 MQPR:$src1), - (v4i32 (mul (v4i32 MQPR:$src2), - (v4i32 (ARMvdup (i32 rGPR:$x))))))), - (v4i32 (MVE_VMLA_qr_u32 $src1, $src2, $x))>; - def : Pat<(v8i16 (add (v8i16 MQPR:$src1), - (v8i16 (mul (v8i16 MQPR:$src2), - (v8i16 (ARMvdup (i32 rGPR:$x))))))), - (v8i16 (MVE_VMLA_qr_u16 $src1, $src2, $x))>; - def : Pat<(v16i8 (add (v16i8 MQPR:$src1), - (v16i8 (mul (v16i8 MQPR:$src2), - (v16i8 (ARMvdup (i32 rGPR:$x))))))), - (v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>; - - def : Pat<(v4i32 (add (v4i32 (ARMvdup (i32 rGPR:$x))), - (v4i32 (mul (v4i32 MQPR:$src1), - (v4i32 MQPR:$src2))))), - (v4i32 (MVE_VMLAS_qr_u32 $src1, $src2, $x))>; - def : Pat<(v8i16 (add (v8i16 (ARMvdup (i32 rGPR:$x))), - (v8i16 (mul (v8i16 MQPR:$src1), - (v8i16 MQPR:$src2))))), - (v8i16 (MVE_VMLAS_qr_u16 $src1, $src2, $x))>; - def : Pat<(v16i8 (add (v16i8 (ARMvdup (i32 rGPR:$x))), - (v16i8 (mul (v16i8 MQPR:$src1), - (v16i8 MQPR:$src2))))), - (v16i8 (MVE_VMLAS_qr_u8 $src1, $src2, $x))>; + // The signed and unsigned variants of this instruction have different + // encodings, but they're functionally identical. For the sake of + // determinism, we generate only the unsigned variant. + if VTI.Unsigned then let Predicates = [HasMVEInt] in { + if scalar_addend then { + def : Pat<(VTI.Vec (add (mul v1, v2), vs)), + (VTI.Vec (Inst v1, v2, s))>; + } else { + def : Pat<(VTI.Vec (add (mul v2, vs), v1)), + (VTI.Vec (Inst v1, v2, s))>; + } + + def : Pat<(VTI.Vec (pred_int v1, v2, s, pred)), + (VTI.Vec (Inst v1, v2, s, ARMVCCThen, pred))>; + } } +defm MVE_VMLA_qr_s8 : MVE_VMLA_qr_multi<"vmla", MVE_v16s8, 0b0>; +defm MVE_VMLA_qr_s16 : MVE_VMLA_qr_multi<"vmla", MVE_v8s16, 0b0>; +defm MVE_VMLA_qr_s32 : MVE_VMLA_qr_multi<"vmla", MVE_v4s32, 0b0>; +defm MVE_VMLA_qr_u8 : MVE_VMLA_qr_multi<"vmla", MVE_v16u8, 0b0>; +defm MVE_VMLA_qr_u16 : MVE_VMLA_qr_multi<"vmla", MVE_v8u16, 0b0>; +defm MVE_VMLA_qr_u32 : MVE_VMLA_qr_multi<"vmla", MVE_v4u32, 0b0>; + +defm MVE_VMLAS_qr_s8 : MVE_VMLA_qr_multi<"vmlas", MVE_v16s8, 0b1>; +defm MVE_VMLAS_qr_s16 : MVE_VMLA_qr_multi<"vmlas", MVE_v8s16, 0b1>; +defm MVE_VMLAS_qr_s32 : MVE_VMLA_qr_multi<"vmlas", MVE_v4s32, 0b1>; +defm MVE_VMLAS_qr_u8 : MVE_VMLA_qr_multi<"vmlas", MVE_v16u8, 0b1>; +defm MVE_VMLAS_qr_u16 : MVE_VMLA_qr_multi<"vmlas", MVE_v8u16, 0b1>; +defm MVE_VMLAS_qr_u32 : MVE_VMLA_qr_multi<"vmlas", MVE_v4u32, 0b1>; + multiclass MVE_VFMA_qr_multi { def "": MVE_VFMAMLA_qr; diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll @@ -101,6 +101,168 @@ ret <4 x float> %1 } +define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) { +; CHECK-LABEL: test_vmlaq_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmla.u8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = mul <16 x i8> %.splat, %b + %1 = add <16 x i8> %0, %a + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) { +; CHECK-LABEL: test_vmlaq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmla.u16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = mul <8 x i16> %.splat, %b + %1 = add <8 x i16> %0, %a + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) { +; CHECK-LABEL: test_vmlaq_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmla.u32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = mul <4 x i32> %.splat, %b + %1 = add <4 x i32> %0, %a + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c) { +; CHECK-LABEL: test_vmlaq_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmla.u8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %0 = mul <16 x i8> %.splat, %b + %1 = add <16 x i8> %0, %a + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) { +; CHECK-LABEL: test_vmlaq_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmla.u16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %0 = mul <8 x i16> %.splat, %b + %1 = add <8 x i16> %0, %a + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) { +; CHECK-LABEL: test_vmlaq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmla.u32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %0 = mul <4 x i32> %.splat, %b + %1 = add <4 x i32> %0, %a + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) { +; CHECK-LABEL: test_vmlasq_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlas.u8 q1, q0, r0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = mul <16 x i8> %b, %a + %.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %1 = add <16 x i8> %.splat, %0 + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) { +; CHECK-LABEL: test_vmlasq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlas.u16 q1, q0, r0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = mul <8 x i16> %b, %a + %.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %1 = add <8 x i16> %.splat, %0 + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) { +; CHECK-LABEL: test_vmlasq_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlas.u32 q1, q0, r0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = mul <4 x i32> %b, %a + %.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = add <4 x i32> %.splat, %0 + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c) { +; CHECK-LABEL: test_vmlasq_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlas.u8 q1, q0, r0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = mul <16 x i8> %b, %a + %.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0 + %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + %1 = add <16 x i8> %.splat, %0 + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) { +; CHECK-LABEL: test_vmlasq_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlas.u16 q1, q0, r0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = mul <8 x i16> %b, %a + %.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0 + %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %1 = add <8 x i16> %.splat, %0 + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) { +; CHECK-LABEL: test_vmlasq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmlas.u32 q1, q0, r0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = mul <4 x i32> %b, %a + %.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0 + %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = add <4 x i32> %.splat, %0 + ret <4 x i32> %1 +} + define arm_aapcs_vfpcc <8 x half> @test_vfmaq_m_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i16 zeroext %p) { ; CHECK-LABEL: test_vfmaq_m_f16: ; CHECK: @ %bb.0: @ %entry @@ -233,6 +395,183 @@ ret <4 x float> %3 } +define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlaq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlat.u8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %c to i32 + %1 = zext i16 %p to i32 + %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = tail call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2) + ret <16 x i8> %3 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlaq_m_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlat.u16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %c to i32 + %1 = zext i16 %p to i32 + %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = tail call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2) + ret <8 x i16> %3 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlaq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlat.u32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_m_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlaq_m_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlat.u8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %c to i32 + %1 = zext i16 %p to i32 + %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = tail call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2) + ret <16 x i8> %3 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_m_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlaq_m_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlat.u16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %c to i32 + %1 = zext i16 %p to i32 + %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = tail call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2) + ret <8 x i16> %3 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_m_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlaq_m_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlat.u32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlasq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlast.u8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %c to i32 + %1 = zext i16 %p to i32 + %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = tail call <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2) + ret <16 x i8> %3 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlasq_m_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlast.u16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %c to i32 + %1 = zext i16 %p to i32 + %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = tail call <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2) + ret <8 x i16> %3 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlasq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlast.u32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_m_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlasq_m_n_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlast.u8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %c to i32 + %1 = zext i16 %p to i32 + %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = tail call <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2) + ret <16 x i8> %3 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_m_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlasq_m_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlast.u16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %c to i32 + %1 = zext i16 %p to i32 + %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = tail call <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2) + ret <8 x i16> %3 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_m_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) { +; CHECK-LABEL: test_vmlasq_m_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmlast.u32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1) + ret <4 x i32> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) @@ -240,3 +579,9 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) declare <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x half>, <8 x i1>) declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>) +declare <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>) +declare <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>) +declare <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>) +declare <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>) +declare <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>) +declare <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)