Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -490,6 +490,7 @@
   setTargetDAGCombine(ISD::INTRINSIC_VOID);
   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 
   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
@@ -8564,6 +8565,81 @@
   return SDValue();
 }
 
+/// Target-specific DAG combine for the across-vector add reduction (addv).
+/// Example:
+///   ext   v1.16b, v0.16b, v0.16b, #8
+///   add   v0.4s, v1.4s, v0.4s
+///   dup   v1.4s, v0.s[1]
+///   add   v0.4s, v1.4s, v0.4s
+/// becomes:
+///   addv  s0, v0.4s
+static SDValue
+performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG,
+                                  const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  // If the input vector is not an ADD, we cannot do this combine.
+  if (N0->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The vector extract index must be constant zero.
+  if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue())
+    return SDValue();
+
+  EVT EltTy = N0.getValueType().getVectorElementType();
+  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+    return SDValue();
+
+  unsigned NumVecElts = N0.getValueType().getVectorNumElements();
+  unsigned NumMaxSubAddElts = NumVecElts / 2;
+  unsigned NumAddElts = 1;
+  SDValue InputADD = N0;
+  // Iterate over each step of the reduction.
+  while (NumAddElts <= NumMaxSubAddElts) {
+    if (InputADD.getOpcode() != ISD::ADD)
+      return SDValue();
+    SDValue ADD = InputADD.getOperand(0);
+    SDValue SV = InputADD.getOperand(1);
+    if (SV.getOpcode() != ISD::VECTOR_SHUFFLE) {
+      ADD = InputADD.getOperand(1);
+      SV = InputADD.getOperand(0);
+      if (SV.getOpcode() != ISD::VECTOR_SHUFFLE)
+        return SDValue();
+    }
+    // Check if this is one step of the addition reduction.
+    // E.g.,
+    //   %add = add %1, %0
+    //   %svn = vector_shuffle %add, <2, 3, u, u>
+    //   %inputadd = add %add, %svn
+    if (SV.getOperand(0) != ADD)
+      return SDValue();
+
+    ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(SV);
+    ArrayRef<int> Mask = SVN->getMask();
+    // Check the mask value in each step of the sub-addition.
+    // E.g., for the add reduction of an <8 x i16> vector, the mask
+    // values in each sub-addition are:
+    //   step 3 : <4,5,6,7,u,u,u,u>
+    //   step 2 : <2,3,u,u,u,u,u,u>
+    //   step 1 : <1,u,u,u,u,u,u,u>
+    for (unsigned int i = 0; i < NumVecElts; ++i)
+      if ((i >= NumAddElts && Mask[i] >= 0) ||
+          (i < NumAddElts &&
+           static_cast<unsigned>(Mask[i]) != (NumAddElts + i)))
+        return SDValue();
+    // Move to the next step.
+    InputADD = ADD;
+    NumAddElts = NumAddElts << 1;
+  }
+  SDLoc DL(N);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
+                     DAG.getNode(AArch64ISD::UADDV, DL,
                                 InputADD.getSimpleValueType(), InputADD),
+                     DAG.getConstant(0, DL, MVT::i64));
+}
+
 /// Target-specific DAG combine function for NEON load/store intrinsics
 /// to merge base address updates.
 static SDValue performNEONPostLDSTCombine(SDNode *N,
@@ -9158,6 +9234,8 @@
     return performNVCASTCombine(N);
   case ISD::INSERT_VECTOR_ELT:
     return performPostLD1Combine(N, DCI, true);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return performAcrossLaneReductionCombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
Index: test/CodeGen/AArch64/aarch64-addv.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/aarch64-addv.ll
@@ -0,0 +1,56 @@
+; RUN: llc -march=aarch64 < %s | FileCheck %s
+
+define i8 @f_v16i8(<16 x i8>* %arr) {
+; CHECK-LABEL: f_v16i8
+; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
+  %bin.rdx = load <16 x i8>, <16 x i8>* %arr
+  %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx0 = add <16 x i8> %bin.rdx, %rdx.shuf0
+  %rdx.shuf = shufflevector <16 x i8> %bin.rdx0, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx11 = add <16 x i8> %bin.rdx0, %rdx.shuf
+  %rdx.shuf12 = shufflevector <16 x i8> %bin.rdx11, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <16 x i8> %bin.rdx11, %rdx.shuf12
+  %rdx.shuf13 = shufflevector <16 x i8> %bin.rdx13, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx14 = add <16 x i8> %bin.rdx13, %rdx.shuf13
+  %r = extractelement <16 x i8> %bin.rdx14, i32 0
+  ret i8 %r
+}
+
+
+define i16 @f_v8i16(<8 x i16>* %arr) {
+; CHECK-LABEL: f_v8i16
+; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
+  %bin.rdx = load <8 x i16>, <8 x i16>* %arr
+  %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx11 = add <8 x i16> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <8 x i16> %bin.rdx11, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <8 x i16> %bin.rdx11, %rdx.shuf12
+  %rdx.shuf13 = shufflevector <8 x i16> %bin.rdx13, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx14 = add <8 x i16> %bin.rdx13, %rdx.shuf13
+  %r = extractelement <8 x i16> %bin.rdx14, i32 0
+  ret i16 %r
+}
+
+
+define i32 @f_v4i32(<4 x i32>* %arr) {
+; CHECK-LABEL: f_v4i32
+; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
+  %bin.rdx = load <4 x i32>, <4 x i32>* %arr
+  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx11 = add <4 x i32> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <4 x i32> %bin.rdx11, %rdx.shuf12
+  %r = extractelement <4 x i32> %bin.rdx13, i32 0
+  ret i32 %r
+}
+
+
+define i64 @f_v2i64(<2 x i64>* %arr) {
+; CHECK-LABEL: f_v2i64
+; CHECK-NOT: addv
+  %bin.rdx = load <2 x i64>, <2 x i64>* %arr
+  %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %bin.rdx0 = add <2 x i64> %bin.rdx, %rdx.shuf0
+  %r = extractelement <2 x i64> %bin.rdx0, i32 0
+  ret i64 %r
+}