Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4792,6 +4792,44 @@
 defm USUBW   : SIMDWideThreeVectorBHS<   1,  0b0011, "usubw",
                  BinOpFrag<(sub node:$LHS, (zanyext node:$RHS))>>;
 
+// Additional patterns for [SU]ML[AS]L
+multiclass Neon_mul_acc_widen_patterns<SDPatternOperator opnode, SDPatternOperator vecopnode,
+                                       Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+  def : Pat<(v4i16 (opnode
+                      V64:$Ra,
+                      (v4i16 (extract_subvector
+                              (vecopnode (v8i8 V64:$Rn),(v8i8 V64:$Rm)),
+                              (i64 0))))),
+            (EXTRACT_SUBREG (v8i16 (INST8B
+                                    (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), V64:$Ra, dsub),
+                                    V64:$Rn, V64:$Rm)), dsub)>;
+  def : Pat<(v2i32 (opnode
+                      V64:$Ra,
+                      (v2i32 (extract_subvector
+                              (vecopnode (v4i16 V64:$Rn),(v4i16 V64:$Rm)),
+                              (i64 0))))),
+            (EXTRACT_SUBREG (v4i32 (INST4H
+                                    (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Ra, dsub),
+                                    V64:$Rn, V64:$Rm)), dsub)>;
+  def : Pat<(v1i64 (opnode
+                      V64:$Ra,
+                      (v1i64 (extract_subvector
+                              (vecopnode (v2i32 V64:$Rn),(v2i32 V64:$Rm)),
+                              (i64 0))))),
+            (EXTRACT_SUBREG (v2i64 (INST2S
+                                    (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), V64:$Ra, dsub),
+                                    V64:$Rn, V64:$Rm)), dsub)>;
+}
+
+defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_umull,
+     UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
+defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_smull,
+     SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
+defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_umull,
+     UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
+defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_smull,
+     SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
+
 // Additional patterns for SMULL and UMULL
 multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
                                    Instruction INST8B, Instruction INST4H, Instruction INST2S> {
Index: llvm/test/CodeGen/AArch64/mla_mls_merge.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/mla_mls_merge.ll
@@ -0,0 +1,290 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s
+
+; Function Attrs: nofree nounwind uwtable
+define dso_local void @test_mla0(<8 x i8>* nocapture readonly %a, <8 x i8>* nocapture readonly %b, i16* nocapture %dest) local_unnamed_addr {
+entry:
+  %0 = load <8 x i8>, <8 x i8>* %a, align 8
+  %1 = load <8 x i8>, <8 x i8>* %b, align 8
+  %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %0, <8 x i8> %1) #2
+  %arrayidx2 = getelementptr inbounds <8 x i8>, <8 x i8>* %a, i64 1
+  %2 = load <8 x i8>, <8 x i8>* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds <8 x i8>, <8 x i8>* %b, i64 1
+  %3 = load <8 x i8>, <8 x i8>* %arrayidx3, align 8
+  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %2, <8 x i8> %3) #2
+  %add.i = add <8 x i16> %vmull.i.i, %vmull.i
+  %shuffle.i = shufflevector <8 x i16> %add.i, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %vrshr_n6 = tail call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %shuffle.i, <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
+  %4 = bitcast i16* %dest to <4 x i16>*
+  store <4 x i16> %vrshr_n6, <4 x i16>* %4, align 2
+  ret void
+}
+; CHECK-LABEL: test_mla0
+; CHECK: umlal v{{[0-9]+}}.8h
+
+; Function Attrs: nofree nosync nounwind readnone willreturn
+declare <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16>, <4 x i16>)
+
+; Function Attrs: nofree nounwind uwtable
+define dso_local void @test_mla1(<8 x i8>* nocapture readonly %a, <8 x i8>* nocapture readonly %b, i16* nocapture %dest) local_unnamed_addr {
+entry:
+  %0 = load <8 x i8>, <8 x i8>* %a, align 8
+  %1 = load <8 x i8>, <8 x i8>* %b, align 8
+  %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %0, <8 x i8> %1) #2
+  %arrayidx2 = getelementptr inbounds <8 x i8>, <8 x i8>* %a, i64 1
+  %2 = load <8 x i8>, <8 x i8>* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds <8 x i8>, <8 x i8>* %b, i64 1
+  %3 = load <8 x i8>, <8 x i8>* %arrayidx3, align 8
+  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %2, <8 x i8> %3) #2
+  %add.i = add <8 x i16> %vmull.i.i, %vmull.i
+  %shuffle.i = shufflevector <8 x i16> %add.i, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %vrshr_n6 = tail call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %shuffle.i, <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
+  %4 = bitcast i16* %dest to <4 x i16>*
+  store <4 x i16> %vrshr_n6, <4 x i16>* %4, align 2
+  ret void
+}
+; CHECK-LABEL: test_mla1
+; CHECK: smlal v{{[0-9]+}}.8h
+
+; Function Attrs: nofree nosync nounwind readnone willreturn
+declare <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16>, <4 x i16>)
+
+; Function Attrs: nofree nounwind uwtable
+define dso_local void @test_mla2(<4 x i16>* nocapture readonly %a, <4 x i16>* nocapture readonly %b, i32* nocapture %dest) local_unnamed_addr {
+entry:
+  %0 = load <4 x i16>, <4 x i16>* %a, align 8
+  %1 = load <4 x i16>, <4 x i16>* %b, align 8
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %0, <4 x i16> %1) #2
+  %arrayidx2 = getelementptr inbounds <4 x i16>, <4 x i16>* %a, i64 1
+  %2 = load <4 x i16>, <4 x i16>* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds <4 x i16>, <4 x i16>* %b, i64 1
+  %3 = load <4 x i16>, <4 x i16>* %arrayidx3, align 8
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %2, <4 x i16> %3) #2
+  %add.i = add <4 x i32> %vmull2.i.i, %vmull2.i
+  %shuffle.i = shufflevector <4 x i32> %add.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %vrshr_n6 = tail call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %shuffle.i, <2 x i32> <i32 -3, i32 -3>)
+  %4 = bitcast i32* %dest to <2 x i32>*
+  store <2 x i32> %vrshr_n6, <2 x i32>* %4, align 4
+  ret void
+}
+; CHECK-LABEL: test_mla2
+; CHECK: umlal v{{[0-9]+}}.4s
+
+; Function Attrs: nofree nosync nounwind readnone willreturn
+declare <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32>, <2 x i32>)
+
+; Function Attrs: nofree nounwind uwtable
+define dso_local void @test_mla3(<4 x i16>* nocapture readonly %a, <4 x i16>* nocapture readonly %b, i32* nocapture %dest) local_unnamed_addr {
+entry:
+  %0 = load <4 x i16>, <4 x i16>* %a, align 8
+  %1 = load <4 x i16>, <4 x i16>* %b, align 8
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %0, <4 x i16> %1) #2
+  %arrayidx2 = getelementptr inbounds <4 x i16>, <4 x i16>* %a, i64 1
+  %2 = load <4 x i16>, <4 x i16>* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds <4 x i16>, <4 x i16>* %b, i64 1
+  %3 = load <4 x i16>, <4 x i16>* %arrayidx3, align 8
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %2, <4 x i16> %3) #2
+  %add.i = add <4 x i32> %vmull2.i.i, %vmull2.i
+  %shuffle.i = shufflevector <4 x i32> %add.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %vrshr_n6 = tail call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %shuffle.i, <2 x i32> <i32 -3, i32 -3>)
+  %4 = bitcast i32* %dest to <2 x i32>*
+  store <2 x i32> %vrshr_n6, <2 x i32>* %4, align 4
+  ret void
+}
+; CHECK-LABEL: test_mla3
+; CHECK: smlal v{{[0-9]+}}.4s
+
+; Function Attrs: nofree nosync nounwind readnone willreturn
+declare <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32>, <2 x i32>)
+
+; Function Attrs: nofree nounwind uwtable
+define dso_local void @test_mla4(<2 x i32>* nocapture readonly %a, <2 x i32>* nocapture readonly %b, i64* nocapture %dest) local_unnamed_addr {
+entry:
+  %0 = load <2 x i32>, <2 x i32>* %a, align 8
+  %1 = load <2 x i32>, <2 x i32>* %b, align 8
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %0, <2 x i32> %1) #2
+  %arrayidx2 = getelementptr inbounds <2 x i32>, <2 x i32>* %a, i64 1
+  %2 = load <2 x i32>, <2 x i32>* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds <2 x i32>, <2 x i32>* %b, i64 1
+  %3 = load <2 x i32>, <2 x i32>* %arrayidx3, align 8
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %2, <2 x i32> %3) #2
+  %add.i = add <2 x i64> %vmull2.i.i, %vmull2.i
+  %shuffle.i = shufflevector <2 x i64> %add.i, <2 x i64> undef, <1 x i32> zeroinitializer
+  %vrshr_n6 = tail call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> %shuffle.i, <1 x i64> <i64 -3>)
+  %4 = bitcast i64* %dest to <1 x i64>*
+  store <1 x i64> %vrshr_n6, <1 x i64>* %4, align 8
+  ret void
+}
+; CHECK-LABEL: test_mla4
+; CHECK: umlal v{{[0-9]+}}.2d
+
+; Function Attrs: nofree nosync nounwind readnone willreturn
+declare <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64>, <1 x i64>)
+
+; Function Attrs: nofree nounwind uwtable
+define dso_local void @test_mla5(<2 x i32>* nocapture readonly %a, <2 x i32>* nocapture readonly %b, i64* nocapture %dest) local_unnamed_addr {
+entry:
+  %0 = load <2 x i32>, <2 x i32>* %a, align 8
+  %1 = load <2 x i32>, <2 x i32>* %b, align 8
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %0, <2 x i32> %1) #2
+  %arrayidx2 = getelementptr inbounds <2 x i32>, <2 x i32>* %a, i64 1
+  %2 = load <2 x i32>, <2 x i32>* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds <2 x i32>, <2 x i32>* %b, i64 1
+  %3 = load <2 x i32>, <2 x i32>* %arrayidx3, align 8
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %2, <2 x i32> %3) #2
+  %add.i = add <2 x i64> %vmull2.i.i, %vmull2.i
+  %shuffle.i = shufflevector <2 x i64> %add.i, <2 x i64> undef, <1 x i32> zeroinitializer
+  %vrshr_n6 = tail call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> %shuffle.i, <1 x i64> <i64 -3>)
+  %4 = bitcast i64* %dest to <1 x i64>*
+  store <1 x i64> %vrshr_n6, <1 x i64>* %4, align 8
+  ret void
+}
+; CHECK-LABEL: test_mla5
+; CHECK: smlal v{{[0-9]+}}.2d
+
+; Function Attrs: nofree nosync nounwind readnone willreturn
+declare <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64>, <1 x i64>)
+
+; Function Attrs: nofree nounwind uwtable
+define dso_local void @test_mls0(<8 x i8>* nocapture readonly %a, <8 x i8>* nocapture readonly %b, i16* nocapture %dest) local_unnamed_addr {
+entry:
+  %0 = load <8 x i8>, <8 x i8>* %a, align 8
+  %1 = load <8 x i8>, <8 x i8>* %b, align 8
+  %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %0, <8 x i8> %1) #2
+  %arrayidx2 = getelementptr inbounds <8 x i8>, <8 x i8>* %a, i64 1
+  %2 = load <8 x i8>, <8 x i8>* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds <8 x i8>, <8 x i8>* %b, i64 1
+  %3 = load <8 x i8>, <8 x i8>* %arrayidx3, align 8
+  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %2, <8 x i8> %3) #2
+  %sub.i = sub <8 x i16> %vmull.i, %vmull.i.i
+  %shuffle.i = shufflevector <8 x i16> %sub.i, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %vrshr_n6 = tail call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %shuffle.i, <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
+  %4 = bitcast i16* %dest to <4 x i16>*
+  store <4 x i16> %vrshr_n6, <4 x i16>* %4, align 2
+  ret void
+}
+; CHECK-LABEL: test_mls0
+; CHECK: umlsl v{{[0-9]+}}.8h
+
+; Function Attrs: nofree nounwind uwtable
+define dso_local void @test_mls1(<8 x i8>* nocapture readonly %a, <8 x i8>* nocapture readonly %b, i16* nocapture %dest) local_unnamed_addr {
+entry:
+  %0 = load <8 x i8>, <8 x i8>* %a, align 8
+  %1 = load <8 x i8>, <8 x i8>* %b, align 8
+  %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %0, <8 x i8> %1) #2
+  %arrayidx2 = getelementptr inbounds <8 x i8>, <8 x i8>* %a, i64 1
+  %2 = load <8 x i8>, <8 x i8>* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds <8 x i8>, <8 x i8>* %b, i64 1
+  %3 = load <8 x i8>, <8 x i8>* %arrayidx3, align 8
+  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %2, <8 x i8> %3) #2
+  %sub.i = sub <8 x i16> %vmull.i, %vmull.i.i
+  %shuffle.i = shufflevector <8 x i16> %sub.i, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %vrshr_n6 = tail call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %shuffle.i, <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
+  %4 = bitcast i16* %dest to <4 x i16>*
+  store <4 x i16> %vrshr_n6, <4 x i16>* %4, align 2
+  ret void
+}
+; CHECK-LABEL: test_mls1
+; CHECK: smlsl v{{[0-9]+}}.8h
+
+; Function Attrs: nofree nounwind uwtable
+define dso_local void @test_mls2(<4 x i16>* nocapture readonly %a, <4 x i16>* nocapture readonly %b, i32* nocapture %dest) local_unnamed_addr {
+entry:
+  %0 = load <4 x i16>, <4 x i16>* %a, align 8
+  %1 = load <4 x i16>, <4 x i16>* %b, align 8
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %0, <4 x i16> %1) #2
+  %arrayidx2 = getelementptr inbounds <4 x i16>, <4 x i16>* %a, i64 1
+  %2 = load <4 x i16>, <4 x i16>* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds <4 x i16>, <4 x i16>* %b, i64 1
+  %3 = load <4 x i16>, <4 x i16>* %arrayidx3, align 8
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %2, <4 x i16> %3) #2
+  %sub.i = sub <4 x i32> %vmull2.i, %vmull2.i.i
+  %shuffle.i = shufflevector <4 x i32> %sub.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %vrshr_n6 = tail call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %shuffle.i, <2 x i32> <i32 -3, i32 -3>)
+  %4 = bitcast i32* %dest to <2 x i32>*
+  store <2 x i32> %vrshr_n6, <2 x i32>* %4, align 4
+  ret void
+}
+; CHECK-LABEL: test_mls2
+; CHECK: umlsl v{{[0-9]+}}.4s
+
+; Function Attrs: nofree nounwind uwtable
+define dso_local void @test_mls3(<4 x i16>* nocapture readonly %a, <4 x i16>* nocapture readonly %b, i32* nocapture %dest) local_unnamed_addr {
+entry:
+  %0 = load <4 x i16>, <4 x i16>* %a, align 8
+  %1 = load <4 x i16>, <4 x i16>* %b, align 8
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %0, <4 x i16> %1) #2
+  %arrayidx2 = getelementptr inbounds <4 x i16>, <4 x i16>* %a, i64 1
+  %2 = load <4 x i16>, <4 x i16>* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds <4 x i16>, <4 x i16>* %b, i64 1
+  %3 = load <4 x i16>, <4 x i16>* %arrayidx3, align 8
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %2, <4 x i16> %3) #2
+  %sub.i = sub <4 x i32> %vmull2.i, %vmull2.i.i
+  %shuffle.i = shufflevector <4 x i32> %sub.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %vrshr_n6 = tail call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %shuffle.i, <2 x i32> <i32 -3, i32 -3>)
+  %4 = bitcast i32* %dest to <2 x i32>*
+  store <2 x i32> %vrshr_n6, <2 x i32>* %4, align 4
+  ret void
+}
+; CHECK-LABEL: test_mls3
+; CHECK: smlsl v{{[0-9]+}}.4s
+
+; Function Attrs: nofree nounwind uwtable
+define dso_local void @test_mls4(<2 x i32>* nocapture readonly %a, <2 x i32>* nocapture readonly %b, i64* nocapture %dest) local_unnamed_addr {
+entry:
+  %0 = load <2 x i32>, <2 x i32>* %a, align 8
+  %1 = load <2 x i32>, <2 x i32>* %b, align 8
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %0, <2 x i32> %1) #2
+  %arrayidx2 = getelementptr inbounds <2 x i32>, <2 x i32>* %a, i64 1
+  %2 = load <2 x i32>, <2 x i32>* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds <2 x i32>, <2 x i32>* %b, i64 1
+  %3 = load <2 x i32>, <2 x i32>* %arrayidx3, align 8
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %2, <2 x i32> %3) #2
+  %sub.i = sub <2 x i64> %vmull2.i, %vmull2.i.i
+  %shuffle.i = shufflevector <2 x i64> %sub.i, <2 x i64> undef, <1 x i32> zeroinitializer
+  %vrshr_n6 = tail call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> %shuffle.i, <1 x i64> <i64 -3>)
+  %4 = bitcast i64* %dest to <1 x i64>*
+  store <1 x i64> %vrshr_n6, <1 x i64>* %4, align 8
+  ret void
+}
+; CHECK-LABEL: test_mls4
+; CHECK: umlsl v{{[0-9]+}}.2d
+
+; Function Attrs: nofree nounwind uwtable
+define dso_local void @test_mls5(<2 x i32>* nocapture readonly %a, <2 x i32>* nocapture readonly %b, i64* nocapture %dest) local_unnamed_addr {
+entry:
+  %0 = load <2 x i32>, <2 x i32>* %a, align 8
+  %1 = load <2 x i32>, <2 x i32>* %b, align 8
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %0, <2 x i32> %1) #2
+  %arrayidx2 = getelementptr inbounds <2 x i32>, <2 x i32>* %a, i64 1
+  %2 = load <2 x i32>, <2 x i32>* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds <2 x i32>, <2 x i32>* %b, i64 1
+  %3 = load <2 x i32>, <2 x i32>* %arrayidx3, align 8
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %2, <2 x i32> %3) #2
+  %sub.i = sub <2 x i64> %vmull2.i, %vmull2.i.i
+  %shuffle.i = shufflevector <2 x i64> %sub.i, <2 x i64> undef, <1 x i32> zeroinitializer
+  %vrshr_n6 = tail call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> %shuffle.i, <1 x i64> <i64 -3>)
+  %4 = bitcast i64* %dest to <1 x i64>*
+  store <1 x i64> %vrshr_n6, <1 x i64>* %4, align 8
+  ret void
+}
+; CHECK-LABEL: test_mls5
+; CHECK: smlsl v{{[0-9]+}}.2d
+
+; Function Attrs: nofree nosync nounwind readnone willreturn
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
+
+; Function Attrs: nofree nosync nounwind readnone willreturn
+declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
+
+; Function Attrs: nofree nosync nounwind readnone willreturn
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
+
+; Function Attrs: nofree nosync nounwind readnone willreturn
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
+
+; Function Attrs: nofree nosync nounwind readnone willreturn
+declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
+
+; Function Attrs: nofree nosync nounwind readnone willreturn
+declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)