diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12336,6 +12336,43 @@
   return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
 }
 
+// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
+static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  // Only a scalar integer result type is handled here.
+  if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
+    return SDValue();
+
+  auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+  auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
+  if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isNullValue())
+    return SDValue();
+
+  SDValue Op1 = LHS->getOperand(0);
+  SDValue Op2 = RHS->getOperand(0);
+  EVT OpVT1 = Op1.getValueType();
+  EVT OpVT2 = Op2.getValueType();
+  if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
+      Op2.getOpcode() != AArch64ISD::UADDV ||
+      OpVT1.getVectorElementType() != VT)
+    return SDValue();
+
+  SDValue Val1 = Op1.getOperand(0);
+  SDValue Val2 = Op2.getOperand(0);
+  EVT ValVT = Val1->getValueType(0);
+  SDLoc DL(N);
+  SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+                     DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
+                     DAG.getConstant(0, DL, MVT::i64));
+}
+
 // The basic add/sub long vector instructions have variants with "2" on the end
 // which act on the high-half of their inputs. They are normally matched by
 // patterns like:
@@ -12389,6 +12426,16 @@
   return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
 }
 
+static SDValue performAddSubCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    SelectionDAG &DAG) {
+  // Try to change sum of two reductions.
+  if (SDValue Val = performUADDVCombine(N, DAG))
+    return Val;
+
+  return performAddSubLongCombine(N, DCI, DAG);
+}
+
 // Massage DAGs which we can use the high-half "long" operations on into
 // something isel will recognize better. E.g.
 //
@@ -14739,7 +14786,7 @@
     return performABSCombine(N, DAG, DCI, Subtarget);
   case ISD::ADD:
   case ISD::SUB:
-    return performAddSubLongCombine(N, DCI, DAG);
+    return performAddSubCombine(N, DCI, DAG);
   case ISD::XOR:
     return performXorCombine(N, DAG, DCI, Subtarget);
   case ISD::MUL:
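For reference, the DAG shape this combine fires on comes from IR like the
following sketch. This is hedged: the signature and the %rdx.1 line are taken
from the addv_combine_i32 test updated below, while the %rdx.2/%r tail is
inferred from the CHECK lines rather than quoted from the test file:

    define i32 @addv_combine_i32(<4 x i32> %a1, <4 x i32> %a2) {
    entry:
      ; Each reduction lowers to a UADDV node feeding a lane-0 extract.
      %rdx.1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a1)
      %rdx.2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2)
      ; The scalar add of the two lane-0 extracts is what
      ; performUADDVCombine rewrites into add-then-reduce.
      %r = add i32 %rdx.1, %rdx.2
      ret i32 %r
    }

    declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

The rewrite is sound because wrapping integer addition is associative and
commutative: summing the two vectors element-wise and reducing once yields the
same value as reducing each vector and adding the two scalar results.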
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -138,11 +138,9 @@
 define i32 @addv_combine_i32(<4 x i32> %a1, <4 x i32> %a2) {
 ; CHECK-LABEL: addv_combine_i32:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    addv s1, v1.4s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    add w0, w8, w9
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 entry:
   %rdx.1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a1)
@@ -154,11 +152,9 @@
 define i64 @addv_combine_i64(<2 x i64> %a1, <2 x i64> %a2) {
 ; CHECK-LABEL: addv_combine_i64:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    addp d1, v1.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
 entry:
   %rdx.1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1)
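As a negative illustration (hypothetical; not part of the patch): the guards in
performUADDVCombine require both operands of the scalar add to be lane-0
extracts of UADDV nodes with identical vector types whose element type matches
the scalar result. A sum of reductions over differently-typed vectors is
therefore left alone; the OpVT1 != OpVT2 check rejects a case such as:

    ; Hypothetical function name; sketch only. Depending on legalization the
    ; v2i32 reduction may not even surface as UADDV, but either way this
    ; combine does not fire because the two reduction types differ.
    define i32 @addv_no_combine(<4 x i32> %a1, <2 x i32> %a2) {
    entry:
      %rdx.1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a1)
      %rdx.2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a2)
      %r = add i32 %rdx.1, %rdx.2
      ret i32 %r
    }

    declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
    declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)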