diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -80,10 +80,8 @@ def OP_QDMULH_LN : Op<(call "vqdmulh", $p0, (call_mangled "splat_lane", $p1, $p2))>; def OP_QRDMULH_LN : Op<(call "vqrdmulh", $p0, (call_mangled "splat_lane", $p1, $p2))>; def OP_QRDMULH_N : Op<(call "vqrdmulh", $p0, (dup $p1))>; -def OP_QRDMLAH : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, $p2))>; -def OP_QRDMLSH : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, $p2))>; -def OP_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, (call_mangled "splat_lane", $p2, $p3)))>; -def OP_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, (call_mangled "splat_lane", $p2, $p3)))>; +def OP_QRDMLAH_LN : Op<(call "vqrdmlah", $p0, $p1, (call_mangled "splat_lane", $p2, $p3))>; +def OP_QRDMLSH_LN : Op<(call "vqrdmlsh", $p0, $p1, (call_mangled "splat_lane", $p2, $p3))>; def OP_FMS_LN : Op<(call "vfma_lane", $p0, (op "-", $p1), $p2, $p3)>; def OP_FMS_LNQ : Op<(call "vfma_laneq", $p0, (op "-", $p1), $p2, $p3)>; def OP_TRN1 : Op<(shuffle $p0, $p1, (interleave (decimate mask0, 2), @@ -185,10 +183,10 @@ def OP_SCALAR_QDMULH_LN : ScalarMulOp<"vqdmulh">; def OP_SCALAR_QRDMULH_LN : ScalarMulOp<"vqrdmulh">; -def OP_SCALAR_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, - (call "vget_lane", $p2, $p3)))>; -def OP_SCALAR_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, - (call "vget_lane", $p2, $p3)))>; +def OP_SCALAR_QRDMLAH_LN : Op<(call "vqrdmlah", $p0, $p1, + (call "vget_lane", $p2, $p3))>; +def OP_SCALAR_QRDMLSH_LN : Op<(call "vqrdmlsh", $p0, $p1, + (call "vget_lane", $p2, $p3))>; def OP_SCALAR_HALF_GET_LN : Op<(bitcast "float16_t", (call "vget_lane", @@ -326,8 +324,8 @@ def VQRDMULH : SInst<"vqrdmulh", "...", "siQsQi">; let ArchGuard = "defined(__ARM_FEATURE_QRDMX)" in { -def VQRDMLAH : SOpInst<"vqrdmlah", "....", "siQsQi", OP_QRDMLAH>; -def VQRDMLSH : 
SOpInst<"vqrdmlsh", "....", "siQsQi", OP_QRDMLSH>; +def VQRDMLAH : SInst<"vqrdmlah", "....", "siQsQi">; +def VQRDMLSH : SInst<"vqrdmlsh", "....", "siQsQi">; } def VQDMLAL : SInst<"vqdmlal", "(>Q)(>Q)..", "si">; @@ -1400,11 +1398,11 @@ let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in { //////////////////////////////////////////////////////////////////////////////// // Signed Saturating Rounding Doubling Multiply Accumulate Returning High Half -def SCALAR_SQRDMLAH : SOpInst<"vqrdmlah", "1111", "SsSi", OP_QRDMLAH>; +def SCALAR_SQRDMLAH : SInst<"vqrdmlah", "1111", "SsSi">; //////////////////////////////////////////////////////////////////////////////// // Signed Saturating Rounding Doubling Multiply Subtract Returning High Half -def SCALAR_SQRDMLSH : SOpInst<"vqrdmlsh", "1111", "SsSi", OP_QRDMLSH>; +def SCALAR_SQRDMLSH : SInst<"vqrdmlsh", "1111", "SsSi">; } //////////////////////////////////////////////////////////////////////////////// diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5874,6 +5874,10 @@ NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType), NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType), NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType), + NEONMAP1(vqrdmlah_v, arm_neon_vqrdmlah, Add1ArgType), + NEONMAP1(vqrdmlahq_v, arm_neon_vqrdmlah, Add1ArgType), + NEONMAP1(vqrdmlsh_v, arm_neon_vqrdmlsh, Add1ArgType), + NEONMAP1(vqrdmlshq_v, arm_neon_vqrdmlsh, Add1ArgType), NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType), NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType), NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts), @@ -6099,6 +6103,10 @@ NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType), NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType), NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType), + NEONMAP1(vqrdmlah_v, aarch64_neon_sqrdmlah, Add1ArgType), + NEONMAP1(vqrdmlahq_v, 
aarch64_neon_sqrdmlah, Add1ArgType), + NEONMAP1(vqrdmlsh_v, aarch64_neon_sqrdmlsh, Add1ArgType), + NEONMAP1(vqrdmlshq_v, aarch64_neon_sqrdmlsh, Add1ArgType), NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0), NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0), NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType), @@ -6301,6 +6309,10 @@ NEONMAP1(vqnegd_s64, aarch64_neon_sqneg, Add1ArgType), NEONMAP1(vqnegh_s16, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors), NEONMAP1(vqnegs_s32, aarch64_neon_sqneg, Add1ArgType), + NEONMAP1(vqrdmlahh_s16, aarch64_neon_sqrdmlah, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqrdmlahs_s32, aarch64_neon_sqrdmlah, Add1ArgType), + NEONMAP1(vqrdmlshh_s16, aarch64_neon_sqrdmlsh, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqrdmlshs_s32, aarch64_neon_sqrdmlsh, Add1ArgType), NEONMAP1(vqrdmulhh_s16, aarch64_neon_sqrdmulh, Vectorize1ArgType | Use64BitVectors), NEONMAP1(vqrdmulhs_s32, aarch64_neon_sqrdmulh, Add1ArgType), NEONMAP1(vqrshlb_s8, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors), diff --git a/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c --- a/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c +++ b/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c @@ -11,9 +11,8 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4:[0-9]+]] -// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x i16> [[VQADD_V2_I]] +// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> 
[[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4:[0-9]+]] +// CHECK-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]] // int16x4_t test_vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { return vqrdmlah_laneq_s16(a, b, v, 7); @@ -24,9 +23,8 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <2 x i32> [[VQADD_V2_I]] +// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]] // int32x2_t test_vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { return vqrdmlah_laneq_s32(a, b, v, 3); @@ -37,9 +35,8 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> -// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <8 x i16> [[VQADDQ_V2_I]] +// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]] // int16x8_t test_vqrdmlahq_laneq_s16(int16x8_t a, 
int16x8_t b, int16x8_t v) { return vqrdmlahq_laneq_s16(a, b, v, 7); @@ -50,9 +47,8 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x i32> [[VQADDQ_V2_I]] +// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]] // int32x4_t test_vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { return vqrdmlahq_laneq_s32(a, b, v, 3); @@ -60,15 +56,12 @@ // CHECK-LABEL: @test_vqrdmlahh_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i64 0 -// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0 -// CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0 -// CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0 -// CHECK-NEXT: [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0 -// CHECK-NEXT: ret i16 [[TMP5]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement 
<4 x i16> undef, i16 [[B:%.*]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP3]] // int16_t test_vqrdmlahh_s16(int16_t a, int16_t b, int16_t c) { return vqrdmlahh_s16(a, b, c); @@ -76,9 +69,8 @@ // CHECK-LABEL: @test_vqrdmlahs_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR4]] -// CHECK-NEXT: [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I]]) #[[ATTR4]] -// CHECK-NEXT: ret i32 [[VQADDS_S32_I]] +// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR4]] +// CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]] // int32_t test_vqrdmlahs_s32(int32_t a, int32_t b, int32_t c) { return vqrdmlahs_s32(a, b, c); @@ -87,15 +79,12 @@ // CHECK-LABEL: @test_vqrdmlahh_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[C:%.*]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0 -// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0 -// CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0 -// CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0 -// CHECK-NEXT: [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) 
#[[ATTR4]] -// CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0 -// CHECK-NEXT: ret i16 [[TMP5]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0 +// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP3]] // int16_t test_vqrdmlahh_lane_s16(int16_t a, int16_t b, int16x4_t c) { return vqrdmlahh_lane_s16(a, b, c, 3); @@ -104,9 +93,8 @@ // CHECK-LABEL: @test_vqrdmlahs_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1 -// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[VGET_LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I]]) #[[ATTR4]] -// CHECK-NEXT: ret i32 [[VQADDS_S32_I]] +// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]]) #[[ATTR4]] +// CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]] // int32_t test_vqrdmlahs_lane_s32(int32_t a, int32_t b, int32x2_t c) { return vqrdmlahs_lane_s32(a, b, c, 1); @@ -115,15 +103,12 @@ // CHECK-LABEL: @test_vqrdmlahh_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[C:%.*]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0 -// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]] 
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0 -// CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0 -// CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0 -// CHECK-NEXT: [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0 -// CHECK-NEXT: ret i16 [[TMP5]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0 +// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP3]] // int16_t test_vqrdmlahh_laneq_s16(int16_t a, int16_t b, int16x8_t c) { return vqrdmlahh_laneq_s16(a, b, c, 7); @@ -132,9 +117,8 @@ // CHECK-LABEL: @test_vqrdmlahs_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3 -// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[VGETQ_LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I]]) #[[ATTR4]] -// CHECK-NEXT: ret i32 [[VQADDS_S32_I]] +// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]]) #[[ATTR4]] +// CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]] // int32_t test_vqrdmlahs_laneq_s32(int32_t a, int32_t b, int32x4_t c) { return vqrdmlahs_laneq_s32(a, b, c, 3); @@ -145,9 +129,8 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> // 
CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x i16> [[VQSUB_V2_I]] +// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]] // int16x4_t test_vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { return vqrdmlsh_laneq_s16(a, b, v, 7); @@ -158,9 +141,8 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <2 x i32> [[VQSUB_V2_I]] +// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]] // int32x2_t test_vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { return vqrdmlsh_laneq_s32(a, b, v, 3); @@ -171,9 +153,8 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> -// 
CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]] +// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]] // int16x8_t test_vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { return vqrdmlshq_laneq_s16(a, b, v, 7); @@ -184,9 +165,8 @@ // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]] +// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]] +// CHECK-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]] // int32x4_t test_vqrdmlshq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { return vqrdmlshq_laneq_s32(a, b, v, 3); @@ -194,15 +174,12 @@ // CHECK-LABEL: @test_vqrdmlshh_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i64 0 -// CHECK-NEXT: [[VQRDMULHH_S16_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]] -// CHECK-NEXT: 
[[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I_I]], i64 0 -// CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0 -// CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0 -// CHECK-NEXT: [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0 -// CHECK-NEXT: ret i16 [[TMP5]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP3]] // int16_t test_vqrdmlshh_s16(int16_t a, int16_t b, int16_t c) { return vqrdmlshh_s16(a, b, c); @@ -210,9 +187,8 @@ // CHECK-LABEL: @test_vqrdmlshs_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VQRDMULHS_S32_I_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR4]] -// CHECK-NEXT: [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I_I]]) #[[ATTR4]] -// CHECK-NEXT: ret i32 [[VQSUBS_S32_I]] +// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR4]] +// CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]] // int32_t test_vqrdmlshs_s32(int32_t a, int32_t b, int32_t c) { return vqrdmlshs_s32(a, b, c); @@ -221,15 +197,12 @@ // CHECK-LABEL: @test_vqrdmlshh_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[C:%.*]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 
[[B:%.*]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0 -// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0 -// CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0 -// CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0 -// CHECK-NEXT: [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0 -// CHECK-NEXT: ret i16 [[TMP5]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0 +// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP3]] // int16_t test_vqrdmlshh_lane_s16(int16_t a, int16_t b, int16x4_t c) { return vqrdmlshh_lane_s16(a, b, c, 3); @@ -238,9 +211,8 @@ // CHECK-LABEL: @test_vqrdmlshs_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1 -// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[VGET_LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I]]) #[[ATTR4]] -// CHECK-NEXT: ret i32 [[VQSUBS_S32_I]] +// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]]) #[[ATTR4]] +// 
CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]] // int32_t test_vqrdmlshs_lane_s32(int32_t a, int32_t b, int32x2_t c) { return vqrdmlshs_lane_s32(a, b, c, 1); @@ -249,15 +221,12 @@ // CHECK-LABEL: @test_vqrdmlshh_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[C:%.*]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0 -// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0 -// CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0 -// CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0 -// CHECK-NEXT: [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0 -// CHECK-NEXT: ret i16 [[TMP5]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0 +// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP3]] // int16_t test_vqrdmlshh_laneq_s16(int16_t a, int16_t b, int16x8_t c) { return vqrdmlshh_laneq_s16(a, b, c, 7); @@ -266,9 +235,8 @@ // CHECK-LABEL: @test_vqrdmlshs_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3 -// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 
@llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[VGETQ_LANE]]) #[[ATTR4]] -// CHECK-NEXT: [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I]]) #[[ATTR4]] -// CHECK-NEXT: ret i32 [[VQSUBS_S32_I]] +// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]]) #[[ATTR4]] +// CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]] // int32_t test_vqrdmlshs_laneq_s32(int32_t a, int32_t b, int32x4_t c) { return vqrdmlshs_laneq_s32(a, b, c, 3); diff --git a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c --- a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c +++ b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c @@ -13,15 +13,13 @@ // CHECK-ARM-LABEL: @test_vqrdmlah_s16( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR4:[0-9]+]] -// CHECK-ARM-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <4 x i16> [[VQADD_V2_I]] +// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3:[0-9]+]] +// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlah_s16( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3:[0-9]+]] -// CHECK-AARCH64-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQADD_V2_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> 
[[C:%.*]]) #[[ATTR3:[0-9]+]] +// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]] // int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) { @@ -30,15 +28,13 @@ // CHECK-ARM-LABEL: @test_vqrdmlah_s32( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR4]] -// CHECK-ARM-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <2 x i32> [[VQADD_V2_I]] +// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]] +// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlah_s32( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQADD_V2_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]] +// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]] // int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) { @@ -47,15 +43,13 @@ // CHECK-ARM-LABEL: @test_vqrdmlahq_s16( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR4]] -// CHECK-ARM-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <8 x i16> [[VQADDQ_V2_I]] +// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> 
@llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]] +// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlahq_s16( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQADDQ_V2_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]] +// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]] // int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { @@ -64,15 +58,13 @@ // CHECK-ARM-LABEL: @test_vqrdmlahq_s32( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR4]] -// CHECK-ARM-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <4 x i32> [[VQADDQ_V2_I]] +// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]] +// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlahq_s32( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQADDQ_V2_I]] +// CHECK-AARCH64-NEXT: 
[[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]] +// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]] // int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { @@ -84,18 +76,16 @@ // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-ARM-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-ARM-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <4 x i16> [[VQADD_V2_I]] +// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]] +// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s16( // CHECK-AARCH64-NEXT: entry: // CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQADD_V2_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]] +// CHECK-AARCH64-NEXT: ret <4 x i16> 
[[VQRDMLAH_V3_I]] // int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { @@ -107,18 +97,16 @@ // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-ARM-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-ARM-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <2 x i32> [[VQADD_V2_I]] +// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]] +// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s32( // CHECK-AARCH64-NEXT: entry: // CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQADD_V2_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]] +// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]] // int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { @@ -130,18 +118,16 @@ // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // 
CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-ARM-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <8 x i16> [[VQADDQ_V2_I]] +// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]] +// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s16( // CHECK-AARCH64-NEXT: entry: // CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQADDQ_V2_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]] +// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]] // int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { @@ -153,18 +139,16 @@ // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// 
CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-ARM-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <4 x i32> [[VQADDQ_V2_I]] +// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]] +// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s32( // CHECK-AARCH64-NEXT: entry: // CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQADDQ_V2_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]] +// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]] // int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { @@ -173,15 +157,13 @@ // CHECK-ARM-LABEL: @test_vqrdmlsh_s16( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR4]] -// CHECK-ARM-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <4 x i16> [[VQSUB_V2_I]] +// CHECK-ARM-NEXT: 
[[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3]] +// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlsh_s16( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I_I]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQSUB_V2_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3]] +// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]] // int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) { @@ -190,15 +172,13 @@ // CHECK-ARM-LABEL: @test_vqrdmlsh_s32( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR4]] -// CHECK-ARM-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <2 x i32> [[VQSUB_V2_I]] +// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]] +// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlsh_s32( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I_I]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: ret <2 x i32> 
[[VQSUB_V2_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]] +// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]] // int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) { @@ -207,15 +187,13 @@ // CHECK-ARM-LABEL: @test_vqrdmlshq_s16( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR4]] -// CHECK-ARM-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]] +// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]] +// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlshq_s16( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I_I]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]] +// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]] // int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { @@ -224,15 +202,13 @@ // CHECK-ARM-LABEL: @test_vqrdmlshq_s32( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR4]] -// CHECK-ARM-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> 
@llvm.ssub.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]] +// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]] +// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlshq_s32( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I_I]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]] +// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]] // int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { @@ -244,18 +220,16 @@ // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-ARM-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-ARM-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <4 x i16> [[VQSUB_V2_I]] +// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]] +// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s16( // CHECK-AARCH64-NEXT: entry: // 
CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQSUB_V2_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]] +// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]] // int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { @@ -267,18 +241,16 @@ // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-ARM-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-ARM-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <2 x i32> [[VQSUB_V2_I]] +// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]] +// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s32( // CHECK-AARCH64-NEXT: entry: // CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-AARCH64-NEXT: [[LANE:%.*]] = 
shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQSUB_V2_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]] +// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]] // int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { @@ -290,18 +262,16 @@ // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]] -// CHECK-ARM-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]] +// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]] +// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s16( // CHECK-AARCH64-NEXT: entry: // CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x 
i16> [[LANE]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]] +// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]] // int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { @@ -313,18 +283,16 @@ // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]] -// CHECK-ARM-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR4]] -// CHECK-ARM-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]] +// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]] +// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s32( // CHECK-AARCH64-NEXT: entry: // CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]] -// CHECK-AARCH64-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR3]] 
-// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]] +// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]] // int32x4_t test_vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -162,6 +162,10 @@ [LLVMMatchType<0>, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; + class AdvSIMD_3IntArg_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; class AdvSIMD_3VectorArg_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], @@ -258,6 +262,9 @@ def int_aarch64_neon_sqrdmulh_lane : AdvSIMD_2VectorArg_Lane_Intrinsic; def int_aarch64_neon_sqrdmulh_laneq : AdvSIMD_2VectorArg_Lane_Intrinsic; + def int_aarch64_neon_sqrdmlah : AdvSIMD_3IntArg_Intrinsic; + def int_aarch64_neon_sqrdmlsh : AdvSIMD_3IntArg_Intrinsic; + // Vector Polynominal Multiply def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic; diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -764,6 +764,9 @@ def int_arm_neon_sha256h2: SHA_3Arg_v4i32_Intrinsic; def int_arm_neon_sha256su1: SHA_3Arg_v4i32_Intrinsic; +def int_arm_neon_vqrdmlah : Neon_3Arg_Intrinsic; +def int_arm_neon_vqrdmlsh : Neon_3Arg_Intrinsic; + // Armv8.2-A dot product instructions class Neon_Dot_Intrinsic : Intrinsic<[llvm_anyvector_ty], diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ 
b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -10556,40 +10556,30 @@ pattern> { } multiclass SIMDThreeSameVectorSQRDMLxHTiedHS opc, string asm, - SDPatternOperator Accum> { + SDPatternOperator op> { def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h", [(set (v4i16 V64:$dst), - (Accum (v4i16 V64:$Rd), - (v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn), - (v4i16 V64:$Rm)))))]>; + (v4i16 (op (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h", [(set (v8i16 V128:$dst), - (Accum (v8i16 V128:$Rd), - (v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn), - (v8i16 V128:$Rm)))))]>; + (v8i16 (op (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s", [(set (v2i32 V64:$dst), - (Accum (v2i32 V64:$Rd), - (v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn), - (v2i32 V64:$Rm)))))]>; + (v2i32 (op (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s", [(set (v4i32 V128:$dst), - (Accum (v4i32 V128:$Rd), - (v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn), - (v4i32 V128:$Rm)))))]>; + (v4i32 (op (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; } multiclass SIMDIndexedSQRDMLxHSDTied opc, string asm, - SDPatternOperator Accum> { + SDPatternOperator op> { def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64, V128_lo, VectorIndexH, asm, ".4h", ".4h", ".4h", ".h", [(set (v4i16 V64:$dst), - (Accum (v4i16 V64:$Rd), - (v4i16 (int_aarch64_neon_sqrdmulh - (v4i16 V64:$Rn), - (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { + (v4i16 (op (v4i16 V64:$Rd), (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx)))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; @@ -10600,11 +10590,9 @@ V128, V128, V128_lo, VectorIndexH, asm, ".8h", ".8h", 
".8h", ".h", [(set (v8i16 V128:$dst), - (Accum (v8i16 V128:$Rd), - (v8i16 (int_aarch64_neon_sqrdmulh - (v8i16 V128:$Rn), - (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { + (v8i16 (op (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx)))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; @@ -10615,75 +10603,26 @@ V64, V64, V128, VectorIndexS, asm, ".2s", ".2s", ".2s", ".s", [(set (v2i32 V64:$dst), - (Accum (v2i32 V64:$Rd), - (v2i32 (int_aarch64_neon_sqrdmulh - (v2i32 V64:$Rn), - (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { + (v2i32 (op (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx)))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } - // FIXME: it would be nice to use the scalar (v1i32) instruction here, but - // an intermediate EXTRACT_SUBREG would be untyped. - // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we - // got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..))) - def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), - (i32 (vector_extract - (v4i32 (insert_subvector - (undef), - (v2i32 (int_aarch64_neon_sqrdmulh - (v2i32 V64:$Rn), - (v2i32 (AArch64duplane32 - (v4i32 V128:$Rm), - VectorIndexS:$idx)))), - (i64 0))), - (i64 0))))), - (EXTRACT_SUBREG - (v2i32 (!cast(NAME # v2i32_indexed) - (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), - FPR32Op:$Rd, - ssub)), - V64:$Rn, - V128:$Rm, - VectorIndexS:$idx)), - ssub)>; - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, V128, V128, V128, VectorIndexS, asm, ".4s", ".4s", ".4s", ".s", [(set (v4i32 V128:$dst), - (Accum (v4i32 V128:$Rd), - (v4i32 (int_aarch64_neon_sqrdmulh - (v4i32 V128:$Rn), - (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { + (v4i32 (op (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx)))))]> { 
bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } - // FIXME: it would be nice to use the scalar (v1i32) instruction here, but - // an intermediate EXTRACT_SUBREG would be untyped. - def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), - (i32 (vector_extract - (v4i32 (int_aarch64_neon_sqrdmulh - (v4i32 V128:$Rn), - (v4i32 (AArch64duplane32 - (v4i32 V128:$Rm), - VectorIndexS:$idx)))), - (i64 0))))), - (EXTRACT_SUBREG - (v4i32 (!cast(NAME # v4i32_indexed) - (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), - FPR32Op:$Rd, - ssub)), - V128:$Rn, - V128:$Rm, - VectorIndexS:$idx)), - ssub)>; - def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, FPR16Op, FPR16Op, V128_lo, VectorIndexH, asm, ".h", "", "", ".h", @@ -10698,11 +10637,9 @@ FPR32Op, FPR32Op, V128, VectorIndexS, asm, ".s", "", "", ".s", [(set (i32 FPR32Op:$dst), - (Accum (i32 FPR32Op:$Rd), - (i32 (int_aarch64_neon_sqrdmulh - (i32 FPR32Op:$Rn), - (i32 (vector_extract (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { + (i32 (op (i32 FPR32Op:$Rd), (i32 FPR32Op:$Rn), + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx)))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4511,9 +4511,9 @@ defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah", - int_aarch64_neon_sqadd>; + int_aarch64_neon_sqrdmlah>; defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh", - int_aarch64_neon_sqsub>; + int_aarch64_neon_sqrdmlsh>; // Extra saturate patterns, other than the intrinsics matches above defm : SIMDThreeSameVectorExtraPatterns<"SQADD", saddsat>; @@ -4780,15 +4780,11 @@ let Predicates = [HasRDM] in { defm SQRDMLAH : 
SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">; defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">; - def : Pat<(i32 (int_aarch64_neon_sqadd - (i32 FPR32:$Rd), - (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn), - (i32 FPR32:$Rm))))), + def : Pat<(i32 (int_aarch64_neon_sqrdmlah (i32 FPR32:$Rd), (i32 FPR32:$Rn), + (i32 FPR32:$Rm))), (SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>; - def : Pat<(i32 (int_aarch64_neon_sqsub - (i32 FPR32:$Rd), - (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn), - (i32 FPR32:$Rm))))), + def : Pat<(i32 (int_aarch64_neon_sqrdmlsh (i32 FPR32:$Rd), (i32 FPR32:$Rn), + (i32 FPR32:$Rm))), (SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>; } @@ -6405,9 +6401,9 @@ defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", int_aarch64_neon_sqsub>; defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah", - int_aarch64_neon_sqadd>; + int_aarch64_neon_sqrdmlah>; defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh", - int_aarch64_neon_sqsub>; + int_aarch64_neon_sqrdmlsh>; defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>; defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -4526,64 +4526,48 @@ defm VQRDMLAH : N3VInt3_HS<1, 0, 0b1011, 1, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", null_frag>; - def : Pat<(v4i16 (saddsat - (v4i16 DPR:$src1), - (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), - (v4i16 DPR:$Vm))))), + def : Pat<(v4i16 (int_arm_neon_vqrdmlah (v4i16 DPR:$src1), (v4i16 DPR:$Vn), + (v4i16 DPR:$Vm))), (v4i16 (VQRDMLAHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v2i32 (saddsat - (v2i32 DPR:$src1), - (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), - (v2i32 DPR:$Vm))))), + def : Pat<(v2i32 
(int_arm_neon_vqrdmlah (v2i32 DPR:$src1), (v2i32 DPR:$Vn), + (v2i32 DPR:$Vm))), (v2i32 (VQRDMLAHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v8i16 (saddsat - (v8i16 QPR:$src1), - (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), - (v8i16 QPR:$Vm))))), + def : Pat<(v8i16 (int_arm_neon_vqrdmlah (v8i16 QPR:$src1), (v8i16 QPR:$Vn), + (v8i16 QPR:$Vm))), (v8i16 (VQRDMLAHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; - def : Pat<(v4i32 (saddsat - (v4i32 QPR:$src1), - (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), - (v4i32 QPR:$Vm))))), + def : Pat<(v4i32 (int_arm_neon_vqrdmlah (v4i32 QPR:$src1), (v4i32 QPR:$Vn), + (v4i32 QPR:$Vm))), (v4i32 (VQRDMLAHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>; defm VQRDMLAHsl : N3VMulOpSL_HS<0b1110, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", null_frag>; - def : Pat<(v4i16 (saddsat - (v4i16 DPR:$src1), - (v4i16 (int_arm_neon_vqrdmulh + def : Pat<(v4i16 (int_arm_neon_vqrdmlah (v4i16 DPR:$src1), (v4i16 DPR:$Vn), (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), - imm:$lane)))))), + imm:$lane)))), (v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; - def : Pat<(v2i32 (saddsat - (v2i32 DPR:$src1), - (v2i32 (int_arm_neon_vqrdmulh + def : Pat<(v2i32 (int_arm_neon_vqrdmlah (v2i32 DPR:$src1), (v2i32 DPR:$Vn), (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), - imm:$lane)))))), + imm:$lane)))), (v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; - def : Pat<(v8i16 (saddsat - (v8i16 QPR:$src1), - (v8i16 (int_arm_neon_vqrdmulh + def : Pat<(v8i16 (int_arm_neon_vqrdmlah (v8i16 QPR:$src1), (v8i16 QPR:$src2), (v8i16 (ARMvduplane (v8i16 QPR:$src3), - imm:$lane)))))), + imm:$lane)))), (v8i16 (VQRDMLAHslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), (v4i16 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; - def : Pat<(v4i32 (saddsat - (v4i32 QPR:$src1), - (v4i32 (int_arm_neon_vqrdmulh + def : Pat<(v4i32 (int_arm_neon_vqrdmlah (v4i32 QPR:$src1), (v4i32 QPR:$src2), (v4i32 (ARMvduplane 
(v4i32 QPR:$src3), - imm:$lane)))))), + imm:$lane)))), (v4i32 (VQRDMLAHslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), (v2i32 (EXTRACT_SUBREG @@ -4596,63 +4580,47 @@ defm VQRDMLSH : N3VInt3_HS<1, 0, 0b1100, 1, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", null_frag>; - def : Pat<(v4i16 (ssubsat - (v4i16 DPR:$src1), - (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), - (v4i16 DPR:$Vm))))), + def : Pat<(v4i16 (int_arm_neon_vqrdmlsh (v4i16 DPR:$src1), (v4i16 DPR:$Vn), + (v4i16 DPR:$Vm))), (v4i16 (VQRDMLSHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v2i32 (ssubsat - (v2i32 DPR:$src1), - (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), - (v2i32 DPR:$Vm))))), + def : Pat<(v2i32 (int_arm_neon_vqrdmlsh (v2i32 DPR:$src1), (v2i32 DPR:$Vn), + (v2i32 DPR:$Vm))), (v2i32 (VQRDMLSHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v8i16 (ssubsat - (v8i16 QPR:$src1), - (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), - (v8i16 QPR:$Vm))))), + def : Pat<(v8i16 (int_arm_neon_vqrdmlsh (v8i16 QPR:$src1), (v8i16 QPR:$Vn), + (v8i16 QPR:$Vm))), (v8i16 (VQRDMLSHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; - def : Pat<(v4i32 (ssubsat - (v4i32 QPR:$src1), - (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), - (v4i32 QPR:$Vm))))), + def : Pat<(v4i32 (int_arm_neon_vqrdmlsh (v4i32 QPR:$src1), (v4i32 QPR:$Vn), + (v4i32 QPR:$Vm))), (v4i32 (VQRDMLSHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>; defm VQRDMLSHsl : N3VMulOpSL_HS<0b1111, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", null_frag>; - def : Pat<(v4i16 (ssubsat - (v4i16 DPR:$src1), - (v4i16 (int_arm_neon_vqrdmulh + def : Pat<(v4i16 (int_arm_neon_vqrdmlsh (v4i16 DPR:$src1), (v4i16 DPR:$Vn), (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), - imm:$lane)))))), + imm:$lane)))), (v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; - def : Pat<(v2i32 (ssubsat - (v2i32 DPR:$src1), - (v2i32 (int_arm_neon_vqrdmulh + def : Pat<(v2i32 (int_arm_neon_vqrdmlsh (v2i32 DPR:$src1), (v2i32 DPR:$Vn), (v2i32 (ARMvduplane 
(v2i32 DPR_VFP2:$Vm), - imm:$lane)))))), + imm:$lane)))), (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; - def : Pat<(v8i16 (ssubsat - (v8i16 QPR:$src1), - (v8i16 (int_arm_neon_vqrdmulh + def : Pat<(v8i16 (int_arm_neon_vqrdmlsh (v8i16 QPR:$src1), (v8i16 QPR:$src2), (v8i16 (ARMvduplane (v8i16 QPR:$src3), - imm:$lane)))))), + imm:$lane)))), (v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), (v4i16 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; - def : Pat<(v4i32 (ssubsat - (v4i32 QPR:$src1), - (v4i32 (int_arm_neon_vqrdmulh + def : Pat<(v4i32 (int_arm_neon_vqrdmlsh (v4i32 QPR:$src1), (v4i32 QPR:$src2), (v4i32 (ARMvduplane (v4i32 QPR:$src3), - imm:$lane)))))), + imm:$lane)))), (v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), (v2i32 (EXTRACT_SUBREG diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll --- a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll @@ -20,6 +20,21 @@ declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32) +declare <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) +declare <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) +declare <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) +declare i32 @llvm.aarch64.neon.sqrdmlah.i32(i32, i32, i32) + +declare <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) +declare <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) +declare <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) +declare i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32, 
i32, i32) + +; The sqadd/sqsub intrinsic calls in this file were previously folded into sqrdmlah/sqrdmlsh +; where they shouldn't be. They should produce sqrdmulh followed by sqadd/sqsub. + ;----------------------------------------------------------------------------- ; RDMA Vector ; test for SIMDThreeSameVectorSQRDMLxHTiedHS @@ -27,7 +42,8 @@ define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { ; CHECK-LABEL: test_sqrdmlah_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sqrdmlah v0.4h, v1.4h, v2.4h +; CHECK-NEXT: sqrdmulh v1.4h, v1.4h, v2.4h +; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod) @@ -37,7 +53,8 @@ define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { ; CHECK-LABEL: test_sqrdmlah_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sqrdmlah v0.8h, v1.8h, v2.8h +; CHECK-NEXT: sqrdmulh v1.8h, v1.8h, v2.8h +; CHECK-NEXT: sqadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod) @@ -47,7 +64,8 @@ define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { ; CHECK-LABEL: test_sqrdmlah_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sqrdmlah v0.2s, v1.2s, v2.2s +; CHECK-NEXT: sqrdmulh v1.2s, v1.2s, v2.2s +; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod) @@ -57,7 +75,8 @@ define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) { ; CHECK-LABEL: test_sqrdmlah_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sqrdmlah v0.4s, v1.4s, v2.4s +; CHECK-NEXT: sqrdmulh v1.4s, v1.4s, v2.4s +;
CHECK-NEXT: sqadd v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs) %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod) @@ -67,7 +86,8 @@ define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { ; CHECK-LABEL: test_sqrdmlsh_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sqrdmlsh v0.4h, v1.4h, v2.4h +; CHECK-NEXT: sqrdmulh v1.4h, v1.4h, v2.4h +; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod) @@ -77,7 +97,8 @@ define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { ; CHECK-LABEL: test_sqrdmlsh_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sqrdmlsh v0.8h, v1.8h, v2.8h +; CHECK-NEXT: sqrdmulh v1.8h, v1.8h, v2.8h +; CHECK-NEXT: sqsub v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod) @@ -87,7 +108,8 @@ define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { ; CHECK-LABEL: test_sqrdmlsh_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sqrdmlsh v0.2s, v1.2s, v2.2s +; CHECK-NEXT: sqrdmulh v1.2s, v1.2s, v2.2s +; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod) @@ -97,7 +119,8 @@ define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) { ; CHECK-LABEL: test_sqrdmlsh_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sqrdmlsh v0.4s, v1.4s, v2.4s +; CHECK-NEXT: sqrdmulh v1.4s, v1.4s, v2.4s +; CHECK-NEXT: sqsub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: 
ret %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs) %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod) @@ -112,7 +135,8 @@ ; CHECK-LABEL: test_sqrdmlah_lane_s16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: sqrdmlah v0.4h, v1.4h, v2.h[3] +; CHECK-NEXT: sqrdmulh v1.4h, v1.4h, v2.h[3] +; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -124,7 +148,8 @@ define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) { ; CHECK-LABEL: test_sqrdmlahq_lane_s16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sqrdmlah v0.8h, v1.8h, v2.h[2] +; CHECK-NEXT: sqrdmulh v1.8h, v1.8h, v2.h[2] +; CHECK-NEXT: sqadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> @@ -137,7 +162,8 @@ ; CHECK-LABEL: test_sqrdmlah_lane_s32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: sqrdmlah v0.2s, v1.2s, v2.s[1] +; CHECK-NEXT: sqrdmulh v1.2s, v1.2s, v2.s[1] +; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -149,7 +175,8 @@ define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) { ; CHECK-LABEL: test_sqrdmlahq_lane_s32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sqrdmlah v0.4s, v1.4s, v2.s[0] +; CHECK-NEXT: sqrdmulh v1.4s, v1.4s, v2.s[0] +; CHECK-NEXT: sqadd v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer @@ -162,7 +189,8 @@ ; CHECK-LABEL: test_sqrdmlsh_lane_s16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: sqrdmlsh v0.4h, v1.4h, v2.h[3] +; CHECK-NEXT: sqrdmulh v1.4h, v1.4h, v2.h[3] +; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h ; 
CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -174,7 +202,8 @@ define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) { ; CHECK-LABEL: test_sqrdmlshq_lane_s16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sqrdmlsh v0.8h, v1.8h, v2.h[2] +; CHECK-NEXT: sqrdmulh v1.8h, v1.8h, v2.h[2] +; CHECK-NEXT: sqsub v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> @@ -187,7 +216,8 @@ ; CHECK-LABEL: test_sqrdmlsh_lane_s32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: sqrdmlsh v0.2s, v1.2s, v2.s[1] +; CHECK-NEXT: sqrdmulh v1.2s, v1.2s, v2.s[1] +; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -199,7 +229,8 @@ define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) { ; CHECK-LABEL: test_sqrdmlshq_lane_s32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sqrdmlsh v0.4s, v1.4s, v2.s[0] +; CHECK-NEXT: sqrdmulh v1.4s, v1.4s, v2.s[0] +; CHECK-NEXT: sqsub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer @@ -216,10 +247,11 @@ define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) { ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s2, w0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: sqrdmlah v2.4h, v0.4h, v1.h[1] -; CHECK-NEXT: umov w0, v2.h[0] +; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[1] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqadd v0.4h, v1.4h, v0.4h +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -233,9 +265,10 @@ define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) { ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16: 
; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s2, w0 -; CHECK-NEXT: sqrdmlah v2.8h, v0.8h, v1.h[1] -; CHECK-NEXT: umov w0, v2.h[0] +; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[1] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqadd v0.8h, v1.8h, v0.8h +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> @@ -249,10 +282,11 @@ define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) { ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s2, w0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: sqrdmlah v2.2s, v0.2s, v1.s[0] -; CHECK-NEXT: fmov w0, s2 +; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[0] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqadd s0, s1, s0 +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -265,9 +299,10 @@ define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) { ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s2, w0 -; CHECK-NEXT: sqrdmlah v2.4s, v0.4s, v1.s[0] -; CHECK-NEXT: fmov w0, s2 +; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[0] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqadd s0, s1, s0 +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer @@ -280,10 +315,11 @@ define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) { ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s2, w0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: sqrdmlsh v2.4h, v0.4h, v1.h[1] -; CHECK-NEXT: umov w0, v2.h[0] +; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[1] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqsub v0.4h, v1.4h, v0.4h +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret entry: %shuffle = 
shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -297,9 +333,10 @@ define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) { ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s2, w0 -; CHECK-NEXT: sqrdmlsh v2.8h, v0.8h, v1.h[1] -; CHECK-NEXT: umov w0, v2.h[0] +; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[1] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqsub v0.8h, v1.8h, v0.8h +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> @@ -313,10 +350,11 @@ define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) { ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s2, w0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: sqrdmlsh v2.2s, v0.2s, v1.s[0] -; CHECK-NEXT: fmov w0, s2 +; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[0] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqsub s0, s1, s0 +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -329,9 +367,10 @@ define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) { ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s2, w0 -; CHECK-NEXT: sqrdmlsh v2.4s, v0.4s, v1.s[0] -; CHECK-NEXT: fmov w0, s2 +; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[0] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqsub s0, s1, s0 +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer @@ -350,9 +389,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s0, w1 ; CHECK-NEXT: fmov s1, w2 -; CHECK-NEXT: fmov s2, w0 -; CHECK-NEXT: sqrdmlah v2.4h, v0.4h, v1.4h -; CHECK-NEXT: umov w0, v2.h[0] +; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.4h +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqadd v0.4h, v1.4h, v0.4h +; 
CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0 %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0 @@ -368,9 +408,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s0, w1 ; CHECK-NEXT: fmov s1, w2 -; CHECK-NEXT: fmov s2, w0 -; CHECK-NEXT: sqrdmlah v2.4s, v0.4s, v1.4s -; CHECK-NEXT: fmov w0, s2 +; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.4s +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqadd v0.4s, v1.4s, v0.4s +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0 %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0 @@ -387,9 +428,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s0, w1 ; CHECK-NEXT: fmov s1, w2 -; CHECK-NEXT: fmov s2, w0 -; CHECK-NEXT: sqrdmlsh v2.4h, v0.4h, v1.4h -; CHECK-NEXT: umov w0, v2.h[0] +; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.4h +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqsub v0.4h, v1.4h, v0.4h +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0 %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0 @@ -405,9 +447,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s0, w1 ; CHECK-NEXT: fmov s1, w2 -; CHECK-NEXT: fmov s2, w0 -; CHECK-NEXT: sqrdmlsh v2.4s, v0.4s, v1.4s -; CHECK-NEXT: fmov w0, s2 +; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.4s +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqsub v0.4s, v1.4s, v0.4s +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0 %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0 @@ -422,10 +465,11 @@ ; CHECK-LABEL: test_sqrdmlah_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s0, w1 +; CHECK-NEXT: fmov s1, w2 +; CHECK-NEXT: sqrdmulh s0, s0, s1 ; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: fmov s2, w2 -; CHECK-NEXT: sqrdmlah s1, s0, s2 -; CHECK-NEXT: fmov w0, s1 +; CHECK-NEXT: sqadd s0, s1, s0 +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs) %retval = call i32 
@llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod) @@ -436,10 +480,11 @@ ; CHECK-LABEL: test_sqrdmlsh_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s0, w1 +; CHECK-NEXT: fmov s1, w2 +; CHECK-NEXT: sqrdmulh s0, s0, s1 ; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: fmov s2, w2 -; CHECK-NEXT: sqrdmlsh s1, s0, s2 -; CHECK-NEXT: fmov w0, s1 +; CHECK-NEXT: sqsub s0, s1, s0 +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs) %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod) @@ -455,10 +500,11 @@ ; CHECK-LABEL: test_sqrdmlah_extract_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s1, w1 -; CHECK-NEXT: fmov s2, w0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: sqrdmlah v2.4h, v1.4h, v0.h[1] -; CHECK-NEXT: umov w0, v2.h[0] +; CHECK-NEXT: sqrdmulh v0.4h, v1.4h, v0.h[1] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqadd v0.4h, v1.4h, v0.4h +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret %shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0 @@ -473,9 +519,10 @@ ; CHECK-LABEL: test_sqrdmlah_extract_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s1, w1 -; CHECK-NEXT: fmov s2, w0 -; CHECK-NEXT: sqrdmlah s2, s1, v0.s[3] -; CHECK-NEXT: fmov w0, s2 +; CHECK-NEXT: sqrdmulh s0, s1, v0.s[3] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqadd s0, s1, s0 +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %extract = extractelement <4 x i32> %rhs, i32 3 %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract) @@ -487,9 +534,10 @@ ; CHECK-LABEL: test_sqrdmlshq_extract_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s1, w1 -; CHECK-NEXT: fmov s2, w0 -; CHECK-NEXT: sqrdmlsh v2.8h, v1.8h, v0.h[1] -; CHECK-NEXT: umov w0, v2.h[0] +; CHECK-NEXT: sqrdmulh v0.8h, v1.8h, v0.h[1] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqsub v0.8h, v1.8h, v0.8h +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret %shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, 
<8 x i32> %x_vec = insertelement <8 x i16> undef, i16 %x, i64 0 @@ -504,12 +552,297 @@ ; CHECK-LABEL: test_sqrdmlsh_extract_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s1, w1 -; CHECK-NEXT: fmov s2, w0 -; CHECK-NEXT: sqrdmlsh s2, s1, v0.s[3] -; CHECK-NEXT: fmov w0, s2 +; CHECK-NEXT: sqrdmulh s0, s1, v0.s[3] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqsub s0, s1, s0 +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %extract = extractelement <4 x i32> %rhs, i32 3 %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract) %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod) ret i32 %retval } + + +;----------------------------------------------------------------------------- +; Using sqrdmlah intrinsics + +define <4 x i16> @test_vqrdmlah_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK-LABEL: test_vqrdmlah_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrdmlah v0.4h, v1.4h, v2.h[7] +; CHECK-NEXT: ret +entry: + %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> + %vqrdmlah_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4 + ret <4 x i16> %vqrdmlah_v3.i +} + +define <2 x i32> @test_vqrdmlah_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK-LABEL: test_vqrdmlah_laneq_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrdmlah v0.2s, v1.2s, v2.s[3] +; CHECK-NEXT: ret +entry: + %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> + %vqrdmlah_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4 + ret <2 x i32> %vqrdmlah_v3.i +} + +define <8 x i16> @test_vqrdmlahq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK-LABEL: test_vqrdmlahq_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrdmlah v0.8h, v1.8h, v2.h[7] +; CHECK-NEXT: ret +entry: + %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> + %vqrdmlahq_v3.i = tail call <8 x i16> 
@llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %lane) #4 + ret <8 x i16> %vqrdmlahq_v3.i +} + +define <4 x i32> @test_vqrdmlahq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK-LABEL: test_vqrdmlahq_laneq_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrdmlah v0.4s, v1.4s, v2.s[3] +; CHECK-NEXT: ret +entry: + %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> + %vqrdmlahq_v3.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %lane) #4 + ret <4 x i32> %vqrdmlahq_v3.i +} + +define i16 @test_vqrdmlahh_s16(i16 %a, i16 %b, i16 %c) { +; CHECK-LABEL: test_vqrdmlahh_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s0, w1 +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: fmov s2, w2 +; CHECK-NEXT: sqrdmlah v1.4h, v0.4h, v2.4h +; CHECK-NEXT: umov w0, v1.h[0] +; CHECK-NEXT: ret +entry: + %0 = insertelement <4 x i16> undef, i16 %a, i64 0 + %1 = insertelement <4 x i16> undef, i16 %b, i64 0 + %2 = insertelement <4 x i16> undef, i16 %c, i64 0 + %vqrdmlahh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4 + %3 = extractelement <4 x i16> %vqrdmlahh_s16.i, i64 0 + ret i16 %3 +} + +define i32 @test_vqrdmlahs_s32(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_vqrdmlahs_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s0, w1 +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: fmov s2, w2 +; CHECK-NEXT: sqrdmlah s1, s0, s2 +; CHECK-NEXT: fmov w0, s1 +; CHECK-NEXT: ret +entry: + %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %c) #4 + ret i32 %vqrdmlahs_s32.i +} + +define i16 @test_vqrdmlahh_lane_s16(i16 %a, i16 %b, <4 x i16> %c) { +; CHECK-LABEL: test_vqrdmlahh_lane_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s1, w1 +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: sqrdmlah v2.4h, v1.4h, v0.h[3] +; CHECK-NEXT: umov w0, v2.h[0] +; CHECK-NEXT: ret 
+entry: + %0 = insertelement <4 x i16> undef, i16 %a, i64 0 + %1 = insertelement <4 x i16> undef, i16 %b, i64 0 + %2 = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> + %vqrdmlahh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4 + %3 = extractelement <4 x i16> %vqrdmlahh_s16.i, i64 0 + ret i16 %3 +} + +define i32 @test_vqrdmlahs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) { +; CHECK-LABEL: test_vqrdmlahs_lane_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s1, w1 +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: sqrdmlah s2, s1, v0.s[1] +; CHECK-NEXT: fmov w0, s2 +; CHECK-NEXT: ret +entry: + %vget_lane = extractelement <2 x i32> %c, i64 1 + %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %vget_lane) #4 + ret i32 %vqrdmlahs_s32.i +} + +define i16 @test_vqrdmlahh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) { +; CHECK-LABEL: test_vqrdmlahh_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s1, w1 +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: sqrdmlah v2.4h, v1.4h, v0.h[7] +; CHECK-NEXT: umov w0, v2.h[0] +; CHECK-NEXT: ret +entry: + %0 = insertelement <4 x i16> undef, i16 %a, i64 0 + %1 = insertelement <4 x i16> undef, i16 %b, i64 0 + %2 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> + %vqrdmlahh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4 + %3 = extractelement <4 x i16> %vqrdmlahh_s16.i, i64 0 + ret i16 %3 +} + +define i32 @test_vqrdmlahs_laneq_s32(i32 %a, i32 %b, <4 x i32> %c) { +; CHECK-LABEL: test_vqrdmlahs_laneq_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s1, w1 +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: sqrdmlah s2, s1, v0.s[3] +; CHECK-NEXT: fmov w0, s2 +; CHECK-NEXT: ret +entry: + %vgetq_lane = extractelement <4 x i32> %c, i64 3 + %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %vgetq_lane) #4 + 
ret i32 %vqrdmlahs_s32.i +} + +define <4 x i16> @test_vqrdmlsh_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK-LABEL: test_vqrdmlsh_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrdmlsh v0.4h, v1.4h, v2.h[7] +; CHECK-NEXT: ret +entry: + %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> + %vqrdmlsh_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4 + ret <4 x i16> %vqrdmlsh_v3.i +} + +define <2 x i32> @test_vqrdmlsh_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK-LABEL: test_vqrdmlsh_laneq_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrdmlsh v0.2s, v1.2s, v2.s[3] +; CHECK-NEXT: ret +entry: + %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> + %vqrdmlsh_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4 + ret <2 x i32> %vqrdmlsh_v3.i +} + +define <8 x i16> @test_vqrdmlshq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK-LABEL: test_vqrdmlshq_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrdmlsh v0.8h, v1.8h, v2.h[7] +; CHECK-NEXT: ret +entry: + %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> + %vqrdmlshq_v3.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %lane) #4 + ret <8 x i16> %vqrdmlshq_v3.i +} + +define <4 x i32> @test_vqrdmlshq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK-LABEL: test_vqrdmlshq_laneq_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrdmlsh v0.4s, v1.4s, v2.s[3] +; CHECK-NEXT: ret +entry: + %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> + %vqrdmlshq_v3.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %lane) #4 + ret <4 x i32> %vqrdmlshq_v3.i +} + +define i16 @test_vqrdmlshh_s16(i16 %a, i16 %b, i16 %c) { +; CHECK-LABEL: test_vqrdmlshh_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov 
s0, w1 +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: fmov s2, w2 +; CHECK-NEXT: sqrdmlsh v1.4h, v0.4h, v2.4h +; CHECK-NEXT: umov w0, v1.h[0] +; CHECK-NEXT: ret +entry: + %0 = insertelement <4 x i16> undef, i16 %a, i64 0 + %1 = insertelement <4 x i16> undef, i16 %b, i64 0 + %2 = insertelement <4 x i16> undef, i16 %c, i64 0 + %vqrdmlshh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4 + %3 = extractelement <4 x i16> %vqrdmlshh_s16.i, i64 0 + ret i16 %3 +} + +define i32 @test_vqrdmlshs_s32(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_vqrdmlshs_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s0, w1 +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: fmov s2, w2 +; CHECK-NEXT: sqrdmlsh s1, s0, s2 +; CHECK-NEXT: fmov w0, s1 +; CHECK-NEXT: ret +entry: + %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %c) #4 + ret i32 %vqrdmlshs_s32.i +} + +define i16 @test_vqrdmlshh_lane_s16(i16 %a, i16 %b, <4 x i16> %c) { +; CHECK-LABEL: test_vqrdmlshh_lane_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s1, w1 +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: sqrdmlsh v2.4h, v1.4h, v0.h[3] +; CHECK-NEXT: umov w0, v2.h[0] +; CHECK-NEXT: ret +entry: + %0 = insertelement <4 x i16> undef, i16 %a, i64 0 + %1 = insertelement <4 x i16> undef, i16 %b, i64 0 + %2 = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> + %vqrdmlshh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4 + %3 = extractelement <4 x i16> %vqrdmlshh_s16.i, i64 0 + ret i16 %3 +} + +define i32 @test_vqrdmlshs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) { +; CHECK-LABEL: test_vqrdmlshs_lane_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s1, w1 +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: sqrdmlsh s2, s1, v0.s[1] +; CHECK-NEXT: fmov w0, s2 +; CHECK-NEXT: ret +entry: + %vget_lane = 
extractelement <2 x i32> %c, i64 1 + %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vget_lane) #4 + ret i32 %vqrdmlshs_s32.i +} + +define i16 @test_vqrdmlshh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) { +; CHECK-LABEL: test_vqrdmlshh_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s1, w1 +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: sqrdmlsh v2.4h, v1.4h, v0.h[7] +; CHECK-NEXT: umov w0, v2.h[0] +; CHECK-NEXT: ret +entry: + %0 = insertelement <4 x i16> undef, i16 %a, i64 0 + %1 = insertelement <4 x i16> undef, i16 %b, i64 0 + %2 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> + %vqrdmlshh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4 + %3 = extractelement <4 x i16> %vqrdmlshh_s16.i, i64 0 + ret i16 %3 +} + +define i32 @test_vqrdmlshs_laneq_s32(i32 %a, i32 %b, <4 x i32> %c) { +; CHECK-LABEL: test_vqrdmlshs_laneq_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s1, w1 +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: sqrdmlsh s2, s1, v0.s[3] +; CHECK-NEXT: fmov w0, s2 +; CHECK-NEXT: ret +entry: + %vgetq_lane = extractelement <4 x i32> %c, i64 3 + %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vgetq_lane) #4 + ret i32 %vqrdmlshs_s32.i +} diff --git a/llvm/test/CodeGen/ARM/neon-v8.1a.ll b/llvm/test/CodeGen/ARM/neon-v8.1a.ll --- a/llvm/test/CodeGen/ARM/neon-v8.1a.ll +++ b/llvm/test/CodeGen/ARM/neon-v8.1a.ll @@ -19,10 +19,23 @@ declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>) declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) +declare <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) +declare <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x 
i16>, <4 x i16>, <4 x i16>) +declare <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) +declare <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) + +; The sadd.sat/ssub.sat intrinsics in this file were previously folded into vqrdmlah/vqrdmlsh +; where they shouldn't be. They should produce vqrdmulh followed by vqadd/vqsub. + define arm_aapcs_vfpcc <4 x i16> @test_vqrdmulah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { ; CHECK-LABEL: test_vqrdmulah_v4i16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vqrdmlah.s16 d0, d1, d2 +; CHECK-NEXT: vqrdmulh.s16 d16, d1, d2 +; CHECK-NEXT: vqadd.s16 d0, d0, d16 ; CHECK-NEXT: bx lr %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) %retval = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod) @@ -32,7 +45,8 @@ define arm_aapcs_vfpcc <8 x i16> @test_vqrdmulah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { ; CHECK-LABEL: test_vqrdmulah_v8i16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vqrdmlah.s16 q0, q1, q2 +; CHECK-NEXT: vqrdmulh.s16 q8, q1, q2 +; CHECK-NEXT: vqadd.s16 q0, q0, q8 ; CHECK-NEXT: bx lr %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) %retval = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod) @@ -42,7 +56,8 @@ define arm_aapcs_vfpcc <2 x i32> @test_vqrdmulah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { ; CHECK-LABEL: test_vqrdmulah_v2i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vqrdmlah.s32 d0, d1, d2 +; CHECK-NEXT: vqrdmulh.s32 d16, d1, d2 +; CHECK-NEXT: vqadd.s32 d0, d0, d16 ; CHECK-NEXT: bx lr %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) %retval = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod) @@ -52,7 +67,8 @@ define arm_aapcs_vfpcc <4 x i32> @test_vqrdmulah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) { ; CHECK-LABEL: test_vqrdmulah_v4i32: ; CHECK: @
%bb.0: -; CHECK-NEXT: vqrdmlah.s32 q0, q1, q2 +; CHECK-NEXT: vqrdmulh.s32 q8, q1, q2 +; CHECK-NEXT: vqadd.s32 q0, q0, q8 ; CHECK-NEXT: bx lr %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs) %retval = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod) @@ -62,7 +78,8 @@ define arm_aapcs_vfpcc <4 x i16> @test_vqrdmulsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { ; CHECK-LABEL: test_vqrdmulsh_v4i16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vqrdmlsh.s16 d0, d1, d2 +; CHECK-NEXT: vqrdmulh.s16 d16, d1, d2 +; CHECK-NEXT: vqsub.s16 d0, d0, d16 ; CHECK-NEXT: bx lr %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) %retval = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod) @@ -72,7 +89,8 @@ define arm_aapcs_vfpcc <8 x i16> @test_vqrdmulsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { ; CHECK-LABEL: test_vqrdmulsh_v8i16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vqrdmlsh.s16 q0, q1, q2 +; CHECK-NEXT: vqrdmulh.s16 q8, q1, q2 +; CHECK-NEXT: vqsub.s16 q0, q0, q8 ; CHECK-NEXT: bx lr %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) %retval = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod) @@ -82,7 +100,8 @@ define arm_aapcs_vfpcc <2 x i32> @test_vqrdmulsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { ; CHECK-LABEL: test_vqrdmulsh_v2i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vqrdmlsh.s32 d0, d1, d2 +; CHECK-NEXT: vqrdmulh.s32 d16, d1, d2 +; CHECK-NEXT: vqsub.s32 d0, d0, d16 ; CHECK-NEXT: bx lr %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) %retval = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod) @@ -92,7 +111,8 @@ define arm_aapcs_vfpcc <4 x i32> @test_vqrdmulsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) { ; CHECK-LABEL: test_vqrdmulsh_v4i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vqrdmlsh.s32 q0, q1, q2 +; CHECK-NEXT: vqrdmulh.s32 q8, q1, q2 +; 
CHECK-NEXT: vqsub.s32 q0, q0, q8 ; CHECK-NEXT: bx lr %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs) %retval = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod) @@ -105,7 +125,8 @@ define arm_aapcs_vfpcc <4 x i16> @test_vqrdmulah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) { ; CHECK-LABEL: test_vqrdmulah_lane_s16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vqrdmlah.s16 d0, d1, d2[3] +; CHECK-NEXT: vqrdmulh.s16 d16, d1, d2[3] +; CHECK-NEXT: vqadd.s16 d0, d0, d16 ; CHECK-NEXT: bx lr entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -118,7 +139,8 @@ ; CHECK-LABEL: test_vqrdmulahq_lane_s16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2 -; CHECK-NEXT: vqrdmlah.s16 q0, q1, d4[2] +; CHECK-NEXT: vqrdmulh.s16 q8, q1, d4[2] +; CHECK-NEXT: vqadd.s16 q0, q0, q8 ; CHECK-NEXT: bx lr entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> @@ -130,7 +152,8 @@ define arm_aapcs_vfpcc <2 x i32> @test_vqrdmulah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) { ; CHECK-LABEL: test_vqrdmulah_lane_s32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vqrdmlah.s32 d0, d1, d2[1] +; CHECK-NEXT: vqrdmulh.s32 d16, d1, d2[1] +; CHECK-NEXT: vqadd.s32 d0, d0, d16 ; CHECK-NEXT: bx lr entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -143,7 +166,8 @@ ; CHECK-LABEL: test_vqrdmulahq_lane_s32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2 -; CHECK-NEXT: vqrdmlah.s32 q0, q1, d4[0] +; CHECK-NEXT: vqrdmulh.s32 q8, q1, d4[0] +; CHECK-NEXT: vqadd.s32 q0, q0, q8 ; CHECK-NEXT: bx lr entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer @@ -155,7 +179,8 @@ define arm_aapcs_vfpcc <4 x i16> @test_vqrdmulsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) { ; CHECK-LABEL: test_vqrdmulsh_lane_s16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vqrdmlsh.s16 d0, d1, d2[3] +; CHECK-NEXT: 
vqrdmulh.s16 d16, d1, d2[3] +; CHECK-NEXT: vqsub.s16 d0, d0, d16 ; CHECK-NEXT: bx lr entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -168,7 +193,8 @@ ; CHECK-LABEL: test_vqrdmulshq_lane_s16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2 -; CHECK-NEXT: vqrdmlsh.s16 q0, q1, d4[2] +; CHECK-NEXT: vqrdmulh.s16 q8, q1, d4[2] +; CHECK-NEXT: vqsub.s16 q0, q0, q8 ; CHECK-NEXT: bx lr entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> @@ -180,7 +206,8 @@ define arm_aapcs_vfpcc <2 x i32> @test_vqrdmulsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) { ; CHECK-LABEL: test_vqrdmulsh_lane_s32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vqrdmlsh.s32 d0, d1, d2[1] +; CHECK-NEXT: vqrdmulh.s32 d16, d1, d2[1] +; CHECK-NEXT: vqsub.s32 d0, d0, d16 ; CHECK-NEXT: bx lr entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -193,7 +220,8 @@ ; CHECK-LABEL: test_vqrdmulshq_lane_s32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2 -; CHECK-NEXT: vqrdmlsh.s32 q0, q1, d4[0] +; CHECK-NEXT: vqrdmulh.s32 q8, q1, d4[0] +; CHECK-NEXT: vqsub.s32 q0, q0, q8 ; CHECK-NEXT: bx lr entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer @@ -201,3 +229,177 @@ %retval = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod) ret <4 x i32> %retval } + + + +define arm_aapcs_vfpcc <4 x i16> @test_vqrdmlah_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) { +; CHECK-LABEL: test_vqrdmlah_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlah.s16 d0, d1, d2 +; CHECK-NEXT: bx lr +entry: + %vqrdmlah_v3.i = tail call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #3 + ret <4 x i16> %vqrdmlah_v3.i +} + +define arm_aapcs_vfpcc <2 x i32> @test_vqrdmlah_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK-LABEL: test_vqrdmlah_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlah.s32 d0, d1, d2 +; 
CHECK-NEXT: bx lr +entry: + %vqrdmlah_v3.i = tail call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #3 + ret <2 x i32> %vqrdmlah_v3.i +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlahq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vqrdmlahq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlah.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %vqrdmlahq_v3.i = tail call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #3 + ret <8 x i16> %vqrdmlahq_v3.i +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlahq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vqrdmlahq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlah.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %vqrdmlahq_v3.i = tail call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #3 + ret <4 x i32> %vqrdmlahq_v3.i +} + +define arm_aapcs_vfpcc <4 x i16> @test_vqrdmlah_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) { +; CHECK-LABEL: test_vqrdmlah_lane_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlah.s16 d0, d1, d2[3] +; CHECK-NEXT: bx lr +entry: + %lane = shufflevector <4 x i16> %c, <4 x i16> poison, <4 x i32> + %vqrdmlah_v3.i = tail call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #3 + ret <4 x i16> %vqrdmlah_v3.i +} + +define arm_aapcs_vfpcc <2 x i32> @test_vqrdmlah_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK-LABEL: test_vqrdmlah_lane_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlah.s32 d0, d1, d2[1] +; CHECK-NEXT: bx lr +entry: + %lane = shufflevector <2 x i32> %c, <2 x i32> poison, <2 x i32> + %vqrdmlah_v3.i = tail call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #3 + ret <2 x i32> %vqrdmlah_v3.i +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlahq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) { +; CHECK-LABEL: 
test_vqrdmlahq_lane_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2 +; CHECK-NEXT: vqrdmlah.s16 q0, q1, d4[3] +; CHECK-NEXT: bx lr +entry: + %lane = shufflevector <4 x i16> %c, <4 x i16> poison, <8 x i32> + %vqrdmlahq_v3.i = tail call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %lane) #3 + ret <8 x i16> %vqrdmlahq_v3.i +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlahq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) { +; CHECK-LABEL: test_vqrdmlahq_lane_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2 +; CHECK-NEXT: vqrdmlah.s32 q0, q1, d4[1] +; CHECK-NEXT: bx lr +entry: + %lane = shufflevector <2 x i32> %c, <2 x i32> poison, <4 x i32> + %vqrdmlahq_v3.i = tail call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %lane) #3 + ret <4 x i32> %vqrdmlahq_v3.i +} + +define arm_aapcs_vfpcc <4 x i16> @test_vqrdmlsh_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) { +; CHECK-LABEL: test_vqrdmlsh_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlsh.s16 d0, d1, d2 +; CHECK-NEXT: bx lr +entry: + %vqrdmlsh_v3.i = tail call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #3 + ret <4 x i16> %vqrdmlsh_v3.i +} + +define arm_aapcs_vfpcc <2 x i32> @test_vqrdmlsh_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK-LABEL: test_vqrdmlsh_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlsh.s32 d0, d1, d2 +; CHECK-NEXT: bx lr +entry: + %vqrdmlsh_v3.i = tail call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #3 + ret <2 x i32> %vqrdmlsh_v3.i +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlshq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vqrdmlshq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlsh.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %vqrdmlshq_v3.i = tail call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x 
i16> %c) #3 + ret <8 x i16> %vqrdmlshq_v3.i +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlshq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vqrdmlshq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlsh.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %vqrdmlshq_v3.i = tail call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #3 + ret <4 x i32> %vqrdmlshq_v3.i +} + +define arm_aapcs_vfpcc <4 x i16> @test_vqrdmlsh_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) { +; CHECK-LABEL: test_vqrdmlsh_lane_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlsh.s16 d0, d1, d2[3] +; CHECK-NEXT: bx lr +entry: + %lane = shufflevector <4 x i16> %c, <4 x i16> poison, <4 x i32> + %vqrdmlsh_v3.i = tail call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #3 + ret <4 x i16> %vqrdmlsh_v3.i +} + +define arm_aapcs_vfpcc <2 x i32> @test_vqrdmlsh_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK-LABEL: test_vqrdmlsh_lane_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlsh.s32 d0, d1, d2[1] +; CHECK-NEXT: bx lr +entry: + %lane = shufflevector <2 x i32> %c, <2 x i32> poison, <2 x i32> + %vqrdmlsh_v3.i = tail call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #3 + ret <2 x i32> %vqrdmlsh_v3.i +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlshq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) { +; CHECK-LABEL: test_vqrdmlshq_lane_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2 +; CHECK-NEXT: vqrdmlsh.s16 q0, q1, d4[3] +; CHECK-NEXT: bx lr +entry: + %lane = shufflevector <4 x i16> %c, <4 x i16> poison, <8 x i32> + %vqrdmlshq_v3.i = tail call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %lane) #3 + ret <8 x i16> %vqrdmlshq_v3.i +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlshq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) { +; CHECK-LABEL: 
test_vqrdmlshq_lane_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2 +; CHECK-NEXT: vqrdmlsh.s32 q0, q1, d4[1] +; CHECK-NEXT: bx lr +entry: + %lane = shufflevector <2 x i32> %c, <2 x i32> poison, <4 x i32> + %vqrdmlshq_v3.i = tail call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %lane) #3 + ret <4 x i32> %vqrdmlshq_v3.i +}