diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -8906,8 +8906,15 @@
   }
 
   def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
-                                          FPR32Op, FPR16Op, V128_lo, VectorIndexH,
-                                          asm, ".h", "", "", ".h", []> {
+                                          FPR32Op, FPR32Op, V64, VectorIndexH,
+                                          asm, ".h", "", "", ".h",
+                                          [(set (i32 FPR32Op:$dst),
+                                                (Accum (i32 FPR32Op:$Rd),
+                                                       (i32 (vector_extract
+                                                                 (v4i32 (int_aarch64_neon_sqdmull
+                                                                            (v4i16 (scalar_to_vector (i32 FPR32Op:$Rn))),
+                                                                            (v4i16 V64:$Rm))),
+                                                                 VectorIndexH:$idx))))]> {
     bits<3> idx;
     let Inst{11} = idx{2};
     let Inst{21} = idx{1};
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -2894,6 +2894,23 @@
   ret <1 x double> %prod
 }
 
+define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind {
+; CHECK-LABEL: sqdmlal_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w1
+; CHECK-NEXT:    fmov s1, w2
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    sqdmlal.h s1, s2, v0[0]
+; CHECK-NEXT:    fmov w0, s1
+; CHECK-NEXT:    ret
+  %tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
+  %tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
+  %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp4 = extractelement <4 x i32> %tmp3, i64 0
+  %tmp5 = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %C, i32 %tmp4)
+  ret i32 %tmp5
+}
+
 define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
 ; CHECK-LABEL: sqdmlal_d:
 ; CHECK:       // %bb.0:
@@ -2908,6 +2925,23 @@
   ret i64 %tmp5
 }
 
+define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind {
+; CHECK-LABEL: sqdmlsl_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w1
+; CHECK-NEXT:    fmov s1, w2
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    sqdmlsl.h s1, s2, v0[0]
+; CHECK-NEXT:    fmov w0, s1
+; CHECK-NEXT:    ret
+  %tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
+  %tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
+  %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp4 = extractelement <4 x i32> %tmp3, i64 0
+  %tmp5 = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %C, i32 %tmp4)
+  ret i32 %tmp5
+}
+
 define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
 ; CHECK-LABEL: sqdmlsl_d:
 ; CHECK:       // %bb.0: