Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12212,6 +12212,43 @@ return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); } +// ADD(UADDV a, UADDV b) --> UADDV((ADD a, b)) +static SDValue performUADDVCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + // TODO: Currently handled for 32 bit integer vectors. + if (N->getOpcode() != ISD::ADD || VT != MVT::i32) + return SDValue(); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT) + return SDValue(); + + auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); + auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1)); + if (!LHSN1 || !RHSN1 || !LHSN1->isNullValue() || !RHSN1->isNullValue()) + return SDValue(); + + SDValue Op1 = LHS->getOperand(0); + SDValue Op2 = RHS->getOperand(0); + if (Op1.getOpcode() != AArch64ISD::UADDV || + Op1.getValueType() != MVT::v4i32 || + Op2.getOpcode() != AArch64ISD::UADDV || Op2.getValueType() != MVT::v4i32) + return SDValue(); + + SDValue Val1 = Op1.getOperand(0); + SDValue Val2 = Op2.getOperand(0); + EVT ValVT = Val1->getValueType(0); + SDLoc DL(N); + SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal), + DAG.getConstant(0, DL, MVT::i64)); +} + // The basic add/sub long vector instructions have variants with "2" on the end // which act on the high-half of their inputs. 
They are normally matched by // patterns like: @@ -12265,6 +12302,18 @@ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); } +static SDValue performAddSubCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Try to change sum of two reductions. + SDValue Val = performUADDVCombine(N, DCI, DAG); + if (Val.getNode()) { + return Val; + } + + return performAddSubLongCombine(N, DCI, DAG); +} + // Massage DAGs which we can use the high-half "long" operations on into // something isel will recognize better. E.g. // @@ -14613,7 +14662,7 @@ break; case ISD::ADD: case ISD::SUB: - return performAddSubLongCombine(N, DCI, DAG); + return performAddSubCombine(N, DCI, DAG); case ISD::XOR: return performXorCombine(N, DAG, DCI, Subtarget); case ISD::MUL: Index: llvm/test/CodeGen/AArch64/aarch64-addv.ll =================================================================== --- llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ llvm/test/CodeGen/AArch64/aarch64-addv.ll @@ -68,3 +68,18 @@ %r = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %bin.rdx) ret i32 %r } + +define i32 @addv_combine(i32* nocapture readonly %a1, i32* nocapture readonly %a2) { +; CHECK-LABEL: addv_combine +; CHECK: add {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s +entry: + %0 = bitcast i32* %a1 to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0 + %2 = bitcast i32* %a2 to <4 x i32>* + %3 = load <4 x i32>, <4 x i32>* %2 + %4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %1) + %5 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %3) + %6 = add i32 %4, %5 + ret i32 %6 +}