Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -9420,6 +9420,77 @@
   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
 }
 
+/// Fold ADD(EXTRACT_SUBVECTOR(VUZP.0, Idx), EXTRACT_SUBVECTOR(VUZP.1, Idx))
+/// into a 64-bit VPADD of the two halves of the VUZP input that feeds the
+/// extracted lanes.
+///
+/// For a 128-bit VUZP(A, B), result 0 holds the even lanes of A followed by
+/// the even lanes of B, and result 1 holds the matching odd lanes. Adding the
+/// low halves of both results is therefore a pairwise add over A, and adding
+/// the high halves is a pairwise add over B.
+///
+/// \returns the VPADD intrinsic node, or an empty SDValue if \p N does not
+/// match the pattern.
+static SDValue AddCombineVUZPToVPADD(SDNode *N, SDValue N0, SDValue N1,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const ARMSubtarget *Subtarget) {
+  // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
+  EVT VT = N->getValueType(0);
+  if (!VT.is64BitVector())
+    return SDValue();
+
+  // Check for ADD(EXTR_SUBV(VUZP.0), EXTR_SUBV(VUZP.1)).
+  if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+      N1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return SDValue();
+
+  SDValue N00 = N0.getOperand(0);
+  SDValue N10 = N1.getOperand(0);
+
+  // Both extracts must read from the same VUZP node, but from different
+  // results of it.
+  if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
+      N00 == N10)
+    return SDValue();
+
+  // Halves of the inputs only exist if the VUZP works on 128-bit vectors.
+  if (!N00.getValueType().is128BitVector())
+    return SDValue();
+
+  // Both extracts must select the same half, otherwise the lanes being added
+  // are not adjacent lanes of a single VUZP input.
+  auto *Idx0 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+  auto *Idx1 = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+  if (!Idx0 || !Idx1 || Idx0->getZExtValue() != Idx1->getZExtValue())
+    return SDValue();
+
+  // The low half of each result comes from the first VUZP input and the high
+  // half from the second one.
+  unsigned NumElts = VT.getVectorNumElements();
+  uint64_t Idx = Idx0->getZExtValue();
+  if (Idx != 0 && Idx != NumElts)
+    return SDValue();
+
+  // Generate vpadd with the right subvectors.
+  SelectionDAG &DAG = DCI.DAG;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDLoc dl(N);
+  SDValue Src = N00.getNode()->getOperand(Idx == 0 ? 0 : 1);
+
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Src,
+                           DAG.getIntPtrConstant(0, dl));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Src,
+                           DAG.getIntPtrConstant(NumElts, dl));
+
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
+                                TLI.getPointerTy(DAG.getDataLayout())));
+  Ops.push_back(Lo);
+  Ops.push_back(Hi);
+
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
+}
+
 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const ARMSubtarget *Subtarget) {
@@ -9926,6 +9997,9 @@
   if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
     return Result;
 
+  if (SDValue Result = AddCombineVUZPToVPADD(N, N0, N1, DCI, Subtarget))
+    return Result;
+
   // Attempt to create vpaddl for this add.
   if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
     return Result;
Index: test/CodeGen/ARM/vpadd.ll
===================================================================
--- test/CodeGen/ARM/vpadd.ll
+++ test/CodeGen/ARM/vpadd.ll
@@ -385,6 +385,26 @@
   ret void
 }
 
+; PR32999: combine vuzp+add->vpadd
+define void @pr32999(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: pr32999:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vpadd.i16 d16, d16, d17
+; CHECK-NEXT:    vstr d16, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %tmp = load <16 x i8>, <16 x i8>* %cbcr
+  %tmp1 = zext <16 x i8> %tmp to <16 x i16>
+  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %tmp2a = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %tmp3a = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %add = add <4 x i16> %tmp2a, %tmp3a
+  store <4 x i16> %add, <4 x i16>* %X, align 8
+  ret void
+}
+
 ; Combine vuzp+vaddl->vpaddl
 define void @addCombineToVPADDLq_s16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp {
 ; CHECK-LABEL: addCombineToVPADDLq_s16: