Index: CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -130,6 +130,11 @@ SDValue ExpandBITREVERSE(SDValue Op); SDValue ExpandCTLZ(SDValue Op); SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op); + SDValue ExpandIDIV(SDValue Op); + + /// Helper Function for expanding idiv using fp div + SDValue ExpandIDivUsingFPDiv(bool isSigned, SDLoc dl, + EVT FPVT, EVT IVT, SDValue A, SDValue B); /// \brief Implements vector promotion. /// @@ -698,11 +703,73 @@ return ExpandCTLZ(Op); case ISD::CTTZ_ZERO_UNDEF: return ExpandCTTZ_ZERO_UNDEF(Op); + case ISD::SDIV: + case ISD::UDIV: + return ExpandIDIV(Op); default: return DAG.UnrollVectorOp(Op.getNode()); } } +SDValue VectorLegalizer::ExpandIDivUsingFPDiv(bool isSigned, SDLoc dl, + EVT FPVT, EVT IVT, SDValue A, SDValue B) { + unsigned Opcode = isSigned ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; + SDValue LHS = DAG.getNode(Opcode, dl, FPVT, A); + SDValue RHS = DAG.getNode(Opcode, dl, FPVT, B); + + SDValue Res = DAG.getNode(ISD::FDIV, dl, FPVT, LHS, RHS); + Opcode = isSigned ? 
ISD::FP_TO_SINT : ISD::FP_TO_UINT; + return DAG.getNode(Opcode, dl, IVT, Res); +} + +SDValue VectorLegalizer::ExpandIDIV(SDValue Op) { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + assert(VT.isVector() && "unexpected type"); + unsigned int NumElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + + if (NumElts == 1) + return DAG.UnrollVectorOp(Op.getNode()); + + if (!TLI.isDesirableToExpandIDivWithFPDiv(VT)) + return DAG.UnrollVectorOp(Op.getNode()); + + EVT FloatVT; + if (EltVT == MVT::i32) + FloatVT = MVT::f64; + else if (EltVT == MVT::i16) + FloatVT = MVT::f32; + else + // in any other case go ahead and unroll + return DAG.UnrollVectorOp(Op.getNode()); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), FloatVT, NumElts); + + bool isSigned = (Op.getOpcode() == ISD::SDIV); + + if (!TLI.isOperationLegalOrCustom(ISD::FDIV, DstVT)) { + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts / 2); + DstVT = EVT::getVectorVT(*DAG.getContext(), FloatVT, NumElts / 2); // FP type must be half-wide to match the NumElts/2 split operands below + + SDValue Lo = DAG.getIntPtrConstant(0, dl); + SDValue Hi = DAG.getIntPtrConstant(NumElts / 2, dl); + SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, LHS, Lo); + SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, RHS, Lo); + SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, LHS, Hi); + SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, RHS, Hi); + + SDValue ResLo = ExpandIDivUsingFPDiv(isSigned, dl, DstVT, HalfVT, ALo, BLo); + SDValue ResHi = ExpandIDivUsingFPDiv(isSigned, dl, DstVT, HalfVT, AHi, BHi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, ResLo, ResHi); + } + return ExpandIDivUsingFPDiv(isSigned, dl, DstVT, VT, LHS, RHS); +} + +SDValue VectorLegalizer::ExpandSELECT(SDValue Op) { // Lower a select instruction where the condition is a scalar and the // operands are vectors.
Lower this select to VSELECT and implement it Index: CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2700,6 +2700,7 @@ bool nuw = false; bool nsw = false; bool exact = false; + bool nof = false; bool vec_redux = false; FastMathFlags FMF; @@ -2711,6 +2712,9 @@ if (const PossiblyExactOperator *ExactOp = dyn_cast(&I)) exact = ExactOp->isExact(); + if (const PossiblyOverflowOperator *OverflowOp = + dyn_cast(&I)) + nof = OverflowOp->isNoOverflow(); if (const FPMathOperator *FPOp = dyn_cast(&I)) FMF = FPOp->getFastMathFlags(); @@ -2721,6 +2725,7 @@ SDNodeFlags Flags; Flags.setExact(exact); + Flags.setNOF(nof); Flags.setNoSignedWrap(nsw); Flags.setNoUnsignedWrap(nuw); Flags.setVectorReduction(vec_redux); @@ -2796,6 +2801,8 @@ SDNodeFlags Flags; Flags.setExact(isa(&I) && cast(&I)->isExact()); + Flags.setNOF(isa(&I) && + cast(&I)->isNoOverflow()); setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(), Op1, Op2, Flags)); } Index: CodeGen/X86/combine-sdiv.ll =================================================================== --- CodeGen/X86/combine-sdiv.ll +++ CodeGen/X86/combine-sdiv.ll @@ -123,30 +123,38 @@ ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_sdiv_dupe: -; AVX: # %bb.0: -; AVX-NEXT: vpextrd $1, %xmm0, %ecx -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: cltd -; AVX-NEXT: idivl %ecx -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: vmovd %xmm0, %esi -; AVX-NEXT: movl %esi, %eax -; AVX-NEXT: cltd -; AVX-NEXT: idivl %esi -; AVX-NEXT: vmovd %eax, %xmm1 -; AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrd $2, %xmm0, %ecx -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: cltd -; AVX-NEXT: idivl %ecx -; AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrd $3, %xmm0, %ecx -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: cltd -; AVX-NEXT: idivl %ecx -; AVX-NEXT: vpinsrd 
$3, %eax, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_sdiv_dupe: +; AVX1: # %bb.0: +; AVX1-NEXT: vpextrd $1, %xmm0, %ecx +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: cltd +; AVX1-NEXT: idivl %ecx +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: vmovd %xmm0, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: cltd +; AVX1-NEXT: idivl %esi +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: cltd +; AVX1-NEXT: idivl %ecx +; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: cltd +; AVX1-NEXT: idivl %ecx +; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_sdiv_dupe: +; AVX2: # %bb.0: +; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX2-NEXT: vdivpd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %1 = sdiv nof <4 x i32> %x, %x ret <4 x i32> %1 } @@ -254,30 +262,41 @@ ; SSE-NEXT: pinsrd $3, %eax, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_sdiv_by_pow2b: -; AVX: # %bb.0: -; AVX-NEXT: vpextrd $1, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: sarl $31, %ecx -; AVX-NEXT: shrl $30, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm1 -; AVX-NEXT: vpextrd $2, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: sarl $31, %ecx -; AVX-NEXT: shrl $29, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: sarl $3, %ecx -; AVX-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrd $3, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: sarl $31, %ecx -; AVX-NEXT: shrl $28, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: sarl $4, %ecx -; AVX-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm0 -; AVX-NEXT: retq +; FIX IT: X86 support variable shift of vectors need to catch this +; in DAGCombine +; +; AVX1-LABEL: combine_vec_sdiv_by_pow2b: +; AVX1: # %bb.0: +; 
AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: sarl $31, %ecx +; AVX1-NEXT: shrl $30, %ecx +; AVX1-NEXT: addl %eax, %ecx +; AVX1-NEXT: sarl $2, %ecx +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm1 +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: sarl $31, %ecx +; AVX1-NEXT: shrl $29, %ecx +; AVX1-NEXT: addl %eax, %ecx +; AVX1-NEXT: sarl $3, %ecx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: sarl $31, %ecx +; AVX1-NEXT: shrl $28, %ecx +; AVX1-NEXT: addl %eax, %ecx +; AVX1-NEXT: sarl $4, %ecx +; AVX1-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_sdiv_by_pow2b: +; AVX2: # %bb.0: +; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX2-NEXT: vdivpd .LCPI13_0(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %1 = sdiv nof <4 x i32> %x, ret <4 x i32> %1 } Index: CodeGen/X86/combine-udiv.ll =================================================================== --- CodeGen/X86/combine-udiv.ll +++ CodeGen/X86/combine-udiv.ll @@ -91,26 +91,53 @@ ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_udiv_dupe: -; AVX: # %bb.0: -; AVX-NEXT: vpextrd $1, %xmm0, %eax -; AVX-NEXT: xorl %edx, %edx -; AVX-NEXT: divl %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: xorl %edx, %edx -; AVX-NEXT: divl %eax -; AVX-NEXT: vmovd %eax, %xmm1 -; AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrd $2, %xmm0, %eax -; AVX-NEXT: xorl %edx, %edx -; AVX-NEXT: divl %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrd $3, %xmm0, %eax -; AVX-NEXT: xorl %edx, %edx -; AVX-NEXT: divl %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_udiv_dupe: +; AVX1: # %bb.0: +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: xorl %edx, %edx +; AVX1-NEXT: divl %eax +; AVX1-NEXT: 
movl %eax, %ecx +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: xorl %edx, %edx +; AVX1-NEXT: divl %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: xorl %edx, %edx +; AVX1-NEXT: divl %eax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: xorl %edx, %edx +; AVX1-NEXT: divl %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; FIXME: uint to double conversion is not effecient GCC has better sequence +; +; AVX2-LABEL: combine_vec_udiv_dupe: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX2-NEXT: vbroadcastsd .LCPI7_0(%rip), %ymm2 # ymm2 = [65536,65536,65536,65536] +; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vdivpd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0] +; AVX2-NEXT: vcvttsd2si %xmm1, %rax +; AVX2-NEXT: vcvttsd2si %xmm0, %rcx +; AVX2-NEXT: vmovd %ecx, %xmm1 +; AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vcvttsd2si %xmm0, %rax +; AVX2-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0] +; AVX2-NEXT: vcvttsd2si %xmm0, %rax +; AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %1 = udiv nof <4 x i32> %x, %x ret <4 x i32> %1 } Index: Target/X86/X86ISelLowering.h =================================================================== --- Target/X86/X86ISelLowering.h +++ Target/X86/X86ISelLowering.h @@ -752,6 +752,10 @@ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + /// Return true if it is profitable to expand integer division 
using + /// floating point division instruction + bool isDesirableToExpandIDivWithFPDiv(EVT VT) const override; + // Return true if it is profitable to combine a BUILD_VECTOR with a // stride-pattern to a shuffle and a truncate. // Example of such a combine: Index: Target/X86/X86ISelLowering.cpp =================================================================== --- Target/X86/X86ISelLowering.cpp +++ Target/X86/X86ISelLowering.cpp @@ -1094,6 +1094,17 @@ setOperationAction(ISD::MSTORE, VT, Legal); } + if (Subtarget.hasAVX2()) { + for (auto VT: {MVT::v8i32, MVT::v16i16}) { + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + } + for (auto VT: {MVT::v4i32, MVT::v8i16}) { + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + } + } + // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, @@ -1204,6 +1215,17 @@ for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + if (Subtarget.hasAVX512()) { + for (auto VT: {MVT::v8i32, MVT::v16i16}) { + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + } + for (auto VT: {MVT::v16i32, MVT::v32i16}) { + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + } + } + for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); @@ -38075,6 +38097,22 @@ return Promote; } +bool X86TargetLowering::isDesirableToExpandIDivWithFPDiv(EVT VT) + const { + // not profitable to convert scalar idiv to fpdiv + if (!VT.isVector()) + return false; + + unsigned NumElements = VT.getVectorNumElements(); + + // in case of small vector still not worthy to do that + if (NumElements <= 2) + return false; + + // we prefer this only in case of AVX2\AVX512 + return (Subtarget.hasAVX2() || 
Subtarget.hasAVX512()); +} + bool X86TargetLowering:: isDesirableToCombineBuildVectorToShuffleTruncate( ArrayRef ShuffleMask, EVT SrcVT, EVT TruncVT) const { Index: Target/X86/X86TargetTransformInfo.cpp =================================================================== --- Target/X86/X86TargetTransformInfo.cpp +++ Target/X86/X86TargetTransformInfo.cpp @@ -2535,6 +2535,14 @@ } bool X86TTIImpl::isLegalMayOverflowUDiv(Type *DataType) { + assert(DataType && "datatype is null"); + if (ST->hasAVX512() || ST->hasAVX2()) { + if (DataType->isVectorTy()) { + Type *ElmType = DataType->getVectorElementType(); + unsigned BW = ElmType->isIntegerTy() ? ElmType->getIntegerBitWidth() : 0; // getIntegerBitWidth asserts on non-integer types (e.g. FP elements); guard first + return BW == 16 || BW == 32; + } + } return false; } Index: llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- llvm/CodeGen/SelectionDAGNodes.h +++ llvm/CodeGen/SelectionDAGNodes.h @@ -364,6 +364,7 @@ bool AllowReciprocal : 1; bool VectorReduction : 1; bool AllowContract : 1; + bool IsNOF : 1; public: /// Default constructor turns off all optimization flags. @@ -371,7 +372,7 @@ : AnyDefined(false), NoUnsignedWrap(false), NoSignedWrap(false), Exact(false), UnsafeAlgebra(false), NoNaNs(false), NoInfs(false), NoSignedZeros(false), AllowReciprocal(false), VectorReduction(false), - AllowContract(false) {} + AllowContract(false), IsNOF(false) {} /// Sets the state of the flags to the defined state. void setDefined() { AnyDefined = true; } @@ -419,6 +420,10 @@ setDefined(); AllowContract = b; } + void setNOF(bool b) { + setDefined(); + IsNOF = b; + } // These are accessors for each flag. bool hasNoUnsignedWrap() const { return NoUnsignedWrap; } @@ -431,6 +436,7 @@ bool hasAllowReciprocal() const { return AllowReciprocal; } bool hasVectorReduction() const { return VectorReduction; } bool hasAllowContract() const { return AllowContract; } + bool hasNOF() const { return IsNOF; } /// Clear any flags in this flag set that aren't also set in Flags.
/// If the given Flags are undefined then don't do anything. @@ -447,6 +453,7 @@ AllowReciprocal &= Flags.AllowReciprocal; VectorReduction &= Flags.VectorReduction; AllowContract &= Flags.AllowContract; + IsNOF &= Flags.IsNOF; } }; Index: llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/CodeGen/TargetLowering.h +++ llvm/CodeGen/TargetLowering.h @@ -2801,6 +2801,12 @@ /// virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + /// Return true if it is profitable to expand integer division using + /// floating point division instruction + virtual bool isDesirableToExpandIDivWithFPDiv(EVT VT) const { + return false; + } + /// Return true if it is profitable to move a following shift through this // node, adjusting any immediate operands as necessary to preserve semantics. // This transformation may not be desirable if it disrupts a particularly