diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15447,7 +15447,11 @@
     return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2));
   case Intrinsic::aarch64_neon_smull:
+    return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2));
   case Intrinsic::aarch64_neon_umull:
+    return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2));
   case Intrinsic::aarch64_neon_pmull:
   case Intrinsic::aarch64_neon_sqdmull:
     return tryCombineLongOpWithDup(IID, N, DCI, DAG);
@@ -18131,6 +18135,9 @@
     return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
   case AArch64ISD::UADDV:
     return performUADDVCombine(N, DAG);
+  case AArch64ISD::SMULL:
+  case AArch64ISD::UMULL:
+    return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5105,10 +5105,10 @@
 defm SADDW  : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
                  BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
 defm SMLAL  : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
-    TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+    TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
 defm SMLSL  : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
-    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
-defm SMULL  : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>;
+    TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
+defm SMULL  : SIMDLongThreeVectorBHS<0, 0b1100, "smull", AArch64smull>;
 defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
                                                int_aarch64_neon_sqadd>;
 defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
@@ -5126,10 +5126,10 @@
 defm UADDW  : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
                  BinOpFrag<(add node:$LHS, (zanyext node:$RHS))>>;
 defm UMLAL  : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
-    TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+    TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
 defm UMLSL  : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
-    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
-defm UMULL  : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>;
+    TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
+defm UMULL  : SIMDLongThreeVectorBHS<1, 0b1100, "umull", AArch64umull>;
 defm USUBL  : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
                  BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>;
 defm USUBW  : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
@@ -5164,74 +5164,15 @@
                                      V64:$Rn, V64:$Rm)), dsub)>;
 }
 
-defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_umull,
-     UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
-defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_smull,
-     SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
-defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_umull,
-     UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
-defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_smull,
-     SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
-// Additional patterns for SMULL and UMULL
-multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
-                                   Instruction INST8B,
-                                   Instruction INST4H,
-                                   Instruction INST2S> {
-  def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
-            (INST8B V64:$Rn, V64:$Rm)>;
-  def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
-            (INST4H V64:$Rn, V64:$Rm)>;
-  def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
-            (INST2S V64:$Rn, V64:$Rm)>;
-}
-
-defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
-                               SMULLv4i16_v4i32, SMULLv2i32_v2i64>;
-defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
-                               UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
-
-// Patterns for smull2/umull2.
-multiclass Neon_mul_high_patterns<SDPatternOperator opnode,
-                                  Instruction INST8B,
-                                  Instruction INST4H,
-                                  Instruction INST2S> {
-  def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn),
-                           (extract_high_v16i8 V128:$Rm))),
-            (INST8B V128:$Rn, V128:$Rm)>;
-  def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn),
-                           (extract_high_v8i16 V128:$Rm))),
-            (INST4H V128:$Rn, V128:$Rm)>;
-  def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn),
-                           (extract_high_v4i32 V128:$Rm))),
-            (INST2S V128:$Rn, V128:$Rm)>;
-}
-
-defm : Neon_mul_high_patterns<AArch64smull, SMULLv16i8_v8i16,
-                              SMULLv8i16_v4i32, SMULLv4i32_v2i64>;
-defm : Neon_mul_high_patterns<AArch64umull, UMULLv16i8_v8i16,
-                              UMULLv8i16_v4i32, UMULLv4i32_v2i64>;
-
-// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
-multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
-                                      Instruction INST8B,
-                                      Instruction INST4H,
-                                      Instruction INST2S> {
-  def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
-            (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>;
-  def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
-            (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>;
-  def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
-            (INST2S V128:$Rd, V64:$Rn, V64:$Rm)>;
-}
-
-defm : Neon_mulacc_widen_patterns<
-  TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
-  SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
-defm : Neon_mulacc_widen_patterns<
-  TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
-  UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
-defm : Neon_mulacc_widen_patterns<
-  TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
-  SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
-defm : Neon_mulacc_widen_patterns<
-  TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
-  UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
-
 // Patterns for 64-bit pmull
 def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
           (PMULLv1i64 V64:$Rn, V64:$Rm)>;
@@ -6404,11 +6345,10 @@
 defm MUL     : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
 defm SMLAL   : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
-    TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+    TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
 defm SMLSL   : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
-    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
-defm SMULL   : SIMDVectorIndexedLongSD<0, 0b1010, "smull",
-                                       int_aarch64_neon_smull>;
+    TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
+defm SMULL   : SIMDVectorIndexedLongSD<0, 0b1010, "smull", AArch64smull>;
 defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
                                            int_aarch64_neon_sqadd>;
 defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
@@ -6419,11 +6359,10 @@
                                            int_aarch64_neon_sqrdmlsh>;
 defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull",
                                  int_aarch64_neon_sqdmull>;
 defm UMLAL   : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
-    TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+    TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
 defm UMLSL   : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl",
-    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
-defm UMULL   : SIMDVectorIndexedLongSD<1, 0b1010, "umull",
-                                       int_aarch64_neon_umull>;
+    TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
+defm UMULL   : SIMDVectorIndexedLongSD<1, 0b1010, "umull", AArch64umull>;
 // A scalar sqdmull with the second operand being a vector lane can be
 // handled directly with the indexed instruction encoding.
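Note (illustration, not part of the patch): a minimal IR sketch of the folding
the .td changes are meant to preserve; the function name below is hypothetical.
Since the intrinsic is now rewritten to AArch64ISD::UMULL during DAG combine,
the add + umull pair is matched by the new AArch64umull TriOpFrag patterns
rather than the removed intrinsic-based ones, and should still select a single
umlal:

; Hypothetical reduced test, modeled on aarch64-wide-mul.ll. Expected codegen
; is roughly: umlal v2.8h, v0.8b, v1.8b / mov v0.16b, v2.16b / ret.
define <8 x i16> @umlal_from_intrinsic(<8 x i8> %a, <8 x i8> %b, <8 x i16> %acc) {
entry:
  %mul = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
  %add = add <8 x i16> %acc, %mul
  ret <8 x i16> %add
}
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)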
diff --git a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
@@ -71,12 +71,10 @@
 define <16 x i16> @mla_i16(<16 x i8> %a, <16 x i8> %b, <16 x i16> %c) {
 ; CHECK-LABEL: mla_i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    umlal2 v3.8h, v0.16b, v1.16b
 ; CHECK-NEXT:    umlal v2.8h, v0.8b, v1.8b
-; CHECK-NEXT:    umlal v3.8h, v4.8b, v5.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
 entry:
   %ea = zext <16 x i8> %a to <16 x i16>
@@ -91,18 +89,14 @@
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ushll v6.8h, v0.8b, #0
 ; CHECK-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT:    ushll v7.8h, v1.8b, #0
-; CHECK-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-NEXT:    ext v16.16b, v6.16b, v6.16b, #8
-; CHECK-NEXT:    ext v17.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    ext v18.16b, v7.16b, v7.16b, #8
-; CHECK-NEXT:    ext v19.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    umlal v4.4s, v0.4h, v1.4h
-; CHECK-NEXT:    umlal v2.4s, v6.4h, v7.4h
-; CHECK-NEXT:    umlal v3.4s, v16.4h, v18.4h
-; CHECK-NEXT:    umlal v5.4s, v17.4h, v19.4h
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ushll2 v7.8h, v1.16b, #0
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    umlal2 v5.4s, v0.8h, v7.8h
+; CHECK-NEXT:    umlal2 v3.4s, v6.8h, v1.8h
+; CHECK-NEXT:    umlal v2.4s, v6.4h, v1.4h
+; CHECK-NEXT:    umlal v4.4s, v0.4h, v7.4h
 ; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    mov v2.16b, v4.16b
 ; CHECK-NEXT:    mov v3.16b, v5.16b
 ; CHECK-NEXT:    ret
@@ -117,43 +111,35 @@
 define <16 x i64> @mla_i64(<16 x i8> %a, <16 x i8> %b, <16 x i64> %c) {
 ; CHECK-LABEL: mla_i64:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov v17.16b, v7.16b
+; CHECK-NEXT:    mov v16.16b, v6.16b
+; CHECK-NEXT:    ldp q6, q7, [sp]
 ; CHECK-NEXT:    ushll v18.8h, v0.8b, #0
 ; CHECK-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT:    ushll v25.8h, v1.8b, #0
+; CHECK-NEXT:    ushll v21.8h, v1.8b, #0
 ; CHECK-NEXT:    ushll2 v1.8h, v1.16b, #0
 ; CHECK-NEXT:    ushll v19.4s, v18.4h, #0
 ; CHECK-NEXT:    ushll v20.4s, v0.4h, #0
 ; CHECK-NEXT:    ushll2 v18.4s, v18.8h, #0
-; CHECK-NEXT:    ushll v26.4s, v25.4h, #0
-; CHECK-NEXT:    ushll v27.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v25.4s, v25.8h, #0
-; CHECK-NEXT:    mov v16.16b, v7.16b
-; CHECK-NEXT:    mov v17.16b, v6.16b
-; CHECK-NEXT:    ldp q6, q7, [sp]
+; CHECK-NEXT:    ushll v22.4s, v21.4h, #0
+; CHECK-NEXT:    ushll v23.4s, v1.4h, #0
+; CHECK-NEXT:    ushll2 v21.4s, v21.8h, #0
 ; CHECK-NEXT:    ushll2 v0.4s, v0.8h, #0
 ; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    ext v21.16b, v19.16b, v19.16b, #8
-; CHECK-NEXT:    ext v22.16b, v20.16b, v20.16b, #8
-; CHECK-NEXT:    ext v23.16b, v18.16b, v18.16b, #8
-; CHECK-NEXT:    ext v28.16b, v26.16b, v26.16b, #8
-; CHECK-NEXT:    ext v29.16b, v27.16b, v27.16b, #8
-; CHECK-NEXT:    ext v30.16b, v25.16b, v25.16b, #8
-; CHECK-NEXT:    ext v24.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    ext v31.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    umlal v4.2d, v18.2s, v25.2s
-; CHECK-NEXT:    umlal v17.2d, v20.2s, v27.2s
-; CHECK-NEXT:    umlal v2.2d, v19.2s, v26.2s
-; CHECK-NEXT:    umlal v3.2d, v21.2s, v28.2s
-; CHECK-NEXT:    umlal v5.2d, v23.2s, v30.2s
-; CHECK-NEXT:    umlal v16.2d, v22.2s, v29.2s
+; CHECK-NEXT:    umlal2 v5.2d, v18.4s, v21.4s
+; CHECK-NEXT:    umlal2 v17.2d, v20.4s, v23.4s
+; CHECK-NEXT:    umlal2 v3.2d, v19.4s, v22.4s
+; CHECK-NEXT:    umlal v2.2d, v19.2s, v22.2s
+; CHECK-NEXT:    umlal v4.2d, v18.2s, v21.2s
+; CHECK-NEXT:    umlal v16.2d, v20.2s, v23.2s
+; CHECK-NEXT:    umlal2 v7.2d, v0.4s, v1.4s
 ; CHECK-NEXT:    umlal v6.2d, v0.2s, v1.2s
-; CHECK-NEXT:    umlal v7.2d, v24.2s, v31.2s
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    mov v1.16b, v3.16b
 ; CHECK-NEXT:    mov v2.16b, v4.16b
 ; CHECK-NEXT:    mov v3.16b, v5.16b
-; CHECK-NEXT:    mov v4.16b, v17.16b
-; CHECK-NEXT:    mov v5.16b, v16.16b
+; CHECK-NEXT:    mov v4.16b, v16.16b
+; CHECK-NEXT:    mov v5.16b, v17.16b
 ; CHECK-NEXT:    ret
 entry:
   %ea = zext <16 x i8> %a to <16 x i64>