diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15147,6 +15147,11 @@
 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
 // similarly here.
 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
+  MVT VT = N.getSimpleValueType();
+  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      N.getConstantOperandVal(1) == 0)
+    N = N.getOperand(0);
+
   switch (N.getOpcode()) {
   case AArch64ISD::DUP:
   case AArch64ISD::DUPLANE8:
@@ -15167,18 +15172,19 @@
     return SDValue();
   }
 
-  MVT NarrowTy = N.getSimpleValueType();
-  if (!NarrowTy.is64BitVector())
+  if (!VT.is64BitVector())
     return SDValue();
 
-  MVT ElementTy = NarrowTy.getVectorElementType();
-  unsigned NumElems = NarrowTy.getVectorNumElements();
-  MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
+  SDLoc DL(N);
+  unsigned NumElems = VT.getVectorNumElements();
+  if (N.getValueType().is64BitVector()) {
+    MVT ElementTy = VT.getVectorElementType();
+    MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
+    N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
+  }
 
-  SDLoc dl(N);
-  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
-                     DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
-                     DAG.getConstant(NumElems, dl, MVT::i64));
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
+                     DAG.getConstant(NumElems, DL, MVT::i64));
 }
 
 static bool isEssentiallyExtractHighSubvector(SDValue N) {
@@ -18225,6 +18231,24 @@
   return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
 }
 
+static SDValue performDUPCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
+  // 128bit vector version.
+  if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
+    EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
+    if (SDNode *LN = DCI.DAG.getNodeIfExists(
+            N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) {
+      SDLoc DL(N);
+      return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
+                             DCI.DAG.getConstant(0, DL, MVT::i64));
+    }
+  }
+
+  return performPostLD1Combine(N, DCI, false);
+}
+
 /// Get rid of unnecessary NVCASTs (that don't change the type).
 static SDValue performNVCASTCombine(SDNode *N) {
   if (N->getValueType(0) == N->getOperand(0).getValueType())
@@ -18948,7 +18972,7 @@
   case AArch64ISD::CSEL:
     return performCSELCombine(N, DCI, DAG);
   case AArch64ISD::DUP:
-    return performPostLD1Combine(N, DCI, false);
+    return performDUPCombine(N, DCI);
   case AArch64ISD::NVCAST:
     return performNVCASTCombine(N);
   case AArch64ISD::SPLICE:
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -201,22 +201,21 @@
 ; CHECK-NEXT:    b .LBB3_6
 ; CHECK-NEXT:  .LBB3_3: // %vector.ph
 ; CHECK-NEXT:    and x10, x9, #0xfffffff0
-; CHECK-NEXT:    dup v0.4h, w8
 ; CHECK-NEXT:    add x11, x2, #32
 ; CHECK-NEXT:    add x12, x0, #16
 ; CHECK-NEXT:    mov x13, x10
-; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:  .LBB3_4: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q2, q3, [x12, #-16]
+; CHECK-NEXT:    ldp q1, q2, [x12, #-16]
 ; CHECK-NEXT:    subs x13, x13, #16
 ; CHECK-NEXT:    add x12, x12, #32
-; CHECK-NEXT:    smull2 v4.4s, v1.8h, v2.8h
+; CHECK-NEXT:    smull2 v3.4s, v0.8h, v1.8h
+; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT:    smull2 v4.4s, v0.8h, v2.8h
 ; CHECK-NEXT:    smull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    smull2 v5.4s, v1.8h, v3.8h
-; CHECK-NEXT:    smull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    stp q2, q4, [x11, #-32]
-; CHECK-NEXT:    stp q3, q5, [x11], #64
+; CHECK-NEXT:    stp q1, q3, [x11, #-32]
+; CHECK-NEXT:    stp q2, q4, [x11], #64
 ; CHECK-NEXT:    b.ne .LBB3_4
 ; CHECK-NEXT:  // %bb.5: // %middle.block
 ; CHECK-NEXT:    cmp x10, x9
@@ -314,22 +313,21 @@
 ; CHECK-NEXT:    b .LBB4_6
 ; CHECK-NEXT:  .LBB4_3: // %vector.ph
 ; CHECK-NEXT:    and x10, x9, #0xfffffff0
-; CHECK-NEXT:    dup v0.4h, w8
 ; CHECK-NEXT:    add x11, x2, #32
 ; CHECK-NEXT:    add x12, x0, #16
 ; CHECK-NEXT:    mov x13, x10
-; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:  .LBB4_4: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q2, q3, [x12, #-16]
+; CHECK-NEXT:    ldp q1, q2, [x12, #-16]
 ; CHECK-NEXT:    subs x13, x13, #16
 ; CHECK-NEXT:    add x12, x12, #32
-; CHECK-NEXT:    umull2 v4.4s, v1.8h, v2.8h
+; CHECK-NEXT:    umull2 v3.4s, v0.8h, v1.8h
+; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT:    umull2 v4.4s, v0.8h, v2.8h
 ; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    umull2 v5.4s, v1.8h, v3.8h
-; CHECK-NEXT:    umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    stp q2, q4, [x11, #-32]
-; CHECK-NEXT:    stp q3, q5, [x11], #64
+; CHECK-NEXT:    stp q1, q3, [x11, #-32]
+; CHECK-NEXT:    stp q2, q4, [x11], #64
 ; CHECK-NEXT:    b.ne .LBB4_4
 ; CHECK-NEXT:  // %bb.5: // %middle.block
 ; CHECK-NEXT:    cmp x10, x9
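
For illustration, below is a minimal IR sketch of the shape performDUPCombine targets. It is a hypothetical reproducer, not one of the patch's test functions, and the function name and exact register assignments are assumptions. The v8i32 multiply of sign-extended operands is split during type legalization, so the splat of %b is wanted both as a full v8i16 DUP (feeding smull2) and as its low v4i16 half (feeding smull). In the patch's own tests the analogous loop previously emitted both "dup v0.4h, w8" and "dup v1.8h, w8"; after this change the 64-bit DUP is rewritten as an extract from the already-existing 128-bit one, so a single dup feeds both smull and smull2, matching the updated CHECK lines above.

; Hypothetical reproducer (assumed to be compiled with something like
; llc -mtriple=aarch64); simplified from the loop pattern in the tests,
; so it may not reproduce the exact scheduling shown above.
define <8 x i32> @smull_splat(<8 x i16> %a, i16 %b) {
  ; Splat %b across all eight i16 lanes.
  %ins = insertelement <8 x i16> poison, i16 %b, i64 0
  %splat = shufflevector <8 x i16> %ins, <8 x i16> poison, <8 x i32> zeroinitializer
  ; Widening multiply: legalization splits this into smull (low half,
  ; v4i16 operands) and smull2 (high half, v8i16 operands), creating the
  ; situation where both a 64-bit and a 128-bit DUP of %b are requested.
  %ea = sext <8 x i16> %a to <8 x i32>
  %eb = sext <8 x i16> %splat to <8 x i32>
  %mul = mul <8 x i32> %ea, %eb
  ret <8 x i32> %mul
}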