Select the scalar by-element SQDMLAL/SQDMLSL instruction (NAME # v1i32_indexed)
when a saturating i32 accumulate consumes lane 0 of a v4i16 sqdmull, both for
a plain V64 multiplicand and for an AArch64duplane16 multiplicand. This
replaces the old pattern that routed through the v4i16_indexed vector form.
Extracts of lane 1 are not matched by these patterns; the *_lane1_sqdmull4s
tests pin the resulting separate sqdmull + sqadd/sqsub sequence.

Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -8847,20 +8847,6 @@
     let Inst{20} = idx{0};
   }
 
-  // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an
-  // intermediate EXTRACT_SUBREG would be untyped.
-  def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
-                (i32 (vector_extract (v4i32
-                         (int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
-                             (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
-                                                    VectorIndexH:$idx)))),
-                         (i64 0))))),
-            (EXTRACT_SUBREG
-                (!cast<Instruction>(NAME # v4i16_indexed)
-                    (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn,
-                    V128_lo:$Rm, VectorIndexH:$idx),
-                ssub)>;
-
   def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
                                       V128, V128,
                                       V128_lo, VectorIndexH,
@@ -8914,6 +8900,31 @@
     let Inst{20} = idx{0};
   }
 
+  def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+                        (i32 (vector_extract
+                                    (v4i32 (int_aarch64_neon_sqdmull
+                                                (v4i16 V64:$Rn),
+                                                (v4i16 V64:$Rm))),
+                                    (i64 0))))),
+            (!cast<Instruction>(NAME # v1i32_indexed)
+                        FPR32Op:$Rd,
+                        (EXTRACT_SUBREG V64:$Rn, hsub),
+                        (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),
+                        (i64 0))>;
+
+  def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+                        (i32 (vector_extract
+                                    (v4i32 (int_aarch64_neon_sqdmull
+                                                (v4i16 V64:$Rn),
+                                                (v4i16 (AArch64duplane16
+                                                            (v8i16 V128_lo:$Rm),
+                                                            VectorIndexH:$idx)))),
+                                    (i64 0))))),
+            (!cast<Instruction>(NAME # v1i32_indexed)
+                        FPR32Op:$Rd,
+                        (EXTRACT_SUBREG V64:$Rn, hsub),
+                        V128_lo:$Rm,
+                        VectorIndexH:$idx)>;
 
   def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
                                       FPR64Op, FPR32Op, V128, VectorIndexS,
Index: llvm/test/CodeGen/AArch64/arm64-vmul.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -1619,7 +1619,7 @@
 ; CHECK-NEXT:    fmov s1, w1
 ; CHECK-NEXT:    fmov s2, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    sqdmlal.4s v2, v1, v0[1]
+; CHECK-NEXT:    sqdmlal.h s2, h1, v0[1]
 ; CHECK-NEXT:    fmov w0, s2
 ; CHECK-NEXT:    ret
   %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
@@ -1637,7 +1637,7 @@
 ; CHECK-NEXT:    fmov s1, w1
 ; CHECK-NEXT:    fmov s2, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    sqdmlsl.4s v2, v1, v0[1]
+; CHECK-NEXT:    sqdmlsl.h s2, h1, v0[1]
 ; CHECK-NEXT:    fmov w0, s2
 ; CHECK-NEXT:    ret
   %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
@@ -1649,6 +1649,38 @@
 }
 declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
 
+define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
+; CHECK-LABEL: sqadd_lane1_sqdmull4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqdmull.4s v0, v0, v1
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    mov.s w8, v0[1]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    sqadd s0, s1, s0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
+  %prod = extractelement <4 x i32> %prod.vec, i32 1
+  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
+  ret i32 %res
+}
+
+define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
+; CHECK-LABEL: sqsub_lane1_sqdmull4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqdmull.4s v0, v0, v1
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    mov.s w8, v0[1]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    sqsub s0, s1, s0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
+  %prod = extractelement <4 x i32> %prod.vec, i32 1
+  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
+  ret i32 %res
+}
+
 define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
 ; CHECK-LABEL: sqdmlal_lane_1d:
 ; CHECK:       // %bb.0:
@@ -2894,6 +2926,23 @@
   ret <1 x double> %prod
 }
 
+define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind {
+; CHECK-LABEL: sqdmlal_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w1
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s2, w2
+; CHECK-NEXT:    sqdmlal.h s2, h1, v0[0]
+; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    ret
+  %tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
+  %tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
+  %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp4 = extractelement <4 x i32> %tmp3, i64 0
+  %tmp5 = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %C, i32 %tmp4)
+  ret i32 %tmp5
+}
+
 define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
 ; CHECK-LABEL: sqdmlal_d:
 ; CHECK:       // %bb.0:
@@ -2908,6 +2957,23 @@
   ret i64 %tmp5
 }
 
+define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind {
+; CHECK-LABEL: sqdmlsl_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w1
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s2, w2
+; CHECK-NEXT:    sqdmlsl.h s2, h1, v0[0]
+; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    ret
+  %tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
+  %tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
+  %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp4 = extractelement <4 x i32> %tmp3, i64 0
+  %tmp5 = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %C, i32 %tmp4)
+  ret i32 %tmp5
+}
+
 define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
 ; CHECK-LABEL: sqdmlsl_d:
 ; CHECK:       // %bb.0: