Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -3477,8 +3477,7 @@
   SDValue BuildSDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                     bool IsAfterLegalization,
                     std::vector<SDNode *> *Created) const;
-  SDValue BuildUDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
-                    bool IsAfterLegalization,
+  SDValue BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
                     std::vector<SDNode *> *Created) const;
 
   /// Targets may override this function to provide custom SDIV lowering for
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3198,8 +3198,6 @@
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
-  ConstantSDNode *N1C = isConstOrConstSplat(N1);
-
   // fold (udiv x, (1 << c)) -> x >>u c
   if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
       DAG.isKnownToBeAPowerOfTwo(N1)) {
@@ -3231,7 +3229,8 @@
 
   // fold (udiv x, c) -> alternate
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
-  if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
+  if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
+      !TLI.isIntDivCheap(N->getValueType(0), Attr))
     if (SDValue Op = BuildUDIV(N))
       return Op;
 
@@ -17826,17 +17825,12 @@
   if (DAG.getMachineFunction().getFunction().optForMinSize())
     return SDValue();
 
-  ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
-  if (!C)
-    return SDValue();
-
-  // Avoid division by zero.
-  if (C->isNullValue())
+  SDValue N1 = N->getOperand(1);
+  if (!isConstantOrConstantVector(N1) || !DAG.isKnownNeverZero(N1))
     return SDValue();
 
   std::vector<SDNode *> Built;
-  SDValue S =
-      TLI.BuildUDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built);
+  SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, &Built);
 
   for (SDNode *N : Built)
     AddToWorklist(N);
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3439,74 +3439,140 @@
 /// return a DAG expression to select that will generate the same value by
 /// multiplying by a magic number.
 /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
-SDValue TargetLowering::BuildUDIV(SDNode *N, const APInt &Divisor,
-                                  SelectionDAG &DAG, bool IsAfterLegalization,
+SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
+                                  bool IsAfterLegalization,
                                   std::vector<SDNode *> *Created) const {
   assert(Created && "No vector to hold udiv ops.");
-  EVT VT = N->getValueType(0);
   SDLoc dl(N);
   auto &DL = DAG.getDataLayout();
 
+  EVT VT = N->getValueType(0);
+  EVT ShVT = getShiftAmountTy(VT, DL);
+
   // Check to see if we can do this.
   // FIXME: We should be more aggressive here.
   if (!isTypeLegal(VT))
     return SDValue();
 
-  // FIXME: We should use a narrower constant when the upper
-  // bits are known to be zero.
-  APInt::mu magics = Divisor.magicu();
-
-  SDValue Q = N->getOperand(0);
-
-  // If the divisor is even, we can avoid using the expensive fixup by shifting
-  // the divided value upfront.
-  if (magics.a != 0 && !Divisor[0]) {
-    unsigned Shift = Divisor.countTrailingZeros();
-    Q = DAG.getNode(
-        ISD::SRL, dl, VT, Q,
-        DAG.getConstant(Shift, dl, getShiftAmountTy(Q.getValueType(), DL)));
-    Created->push_back(Q.getNode());
+  auto BuildUDIVPattern = [](const APInt &Divisor, unsigned &PreShift,
+                             APInt &Magic, unsigned &PostShift) {
+    // FIXME: We should use a narrower constant when the upper
+    // bits are known to be zero.
+    APInt::mu magics = Divisor.magicu();
+    PreShift = PostShift = 0;
+
+    // If the divisor is even, we can avoid using the expensive fixup by
+    // shifting the divided value upfront.
+    if (magics.a != 0 && !Divisor[0]) {
+      PreShift = Divisor.countTrailingZeros();
+      // Get magic number for the shifted divisor.
+      magics = Divisor.lshr(PreShift).magicu(PreShift);
+      assert(magics.a == 0 && "Should use cheap fixup now");
+    }
+
+    Magic = magics.m;
+
+    if (magics.a == 0) {
+      assert(magics.s < Divisor.getBitWidth() &&
+             "We shouldn't generate an undefined shift!");
+      PostShift = magics.s;
+      return false;
+    } else {
+      PostShift = magics.s - 1;
+      return true;
+    }
+  };
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
 
-    // Get magic number for the shifted divisor.
-    magics = Divisor.lshr(Shift).magicu(Shift);
-    assert(magics.a == 0 && "Should use cheap fixup now");
+  // Collect the shifts/magic values from each element.
+  bool UseNPQ = false;
+  SDValue PreShift, PostShift, MagicFactor;
+  SmallVector<int, 16> ShuffleMask;
+  if (VT.isVector()) {
+    EVT SVT = VT.getScalarType();
+    EVT ShSVT = ShVT.getScalarType();
+    unsigned NumElts = VT.getVectorNumElements();
+    SmallVector<SDValue, 16> MagicFactors;
+    SmallVector<SDValue, 16> PreShifts;
+    SmallVector<SDValue, 16> PostShifts;
+    if (ISD::BUILD_VECTOR != N1.getOpcode())
+      return SDValue();
+    for (unsigned i = 0; i != NumElts; ++i) {
+      auto *C = dyn_cast<ConstantSDNode>(N1.getOperand(i));
+      if (!C || C->isNullValue())
+        return SDValue();
+      APInt MagicVal;
+      unsigned PreShiftVal, PostShiftVal;
+      bool SelNPQ = BuildUDIVPattern(C->getAPIntValue(), PreShiftVal, MagicVal,
+                                     PostShiftVal);
+      PreShifts.push_back(DAG.getConstant(PreShiftVal, dl, ShSVT));
+      MagicFactors.push_back(DAG.getConstant(MagicVal, dl, SVT));
+      PostShifts.push_back(DAG.getConstant(PostShiftVal, dl, ShSVT));
+      ShuffleMask.push_back(SelNPQ ? i + NumElts : i);
+      UseNPQ |= SelNPQ;
+    }
+    if (!isShuffleMaskLegal(ShuffleMask, VT))
+      return SDValue();
+    PreShift = DAG.getBuildVector(ShVT, dl, PreShifts);
+    MagicFactor = DAG.getBuildVector(VT, dl, MagicFactors);
+    PostShift = DAG.getBuildVector(ShVT, dl, PostShifts);
+  } else {
+    auto *C = dyn_cast<ConstantSDNode>(N1);
+    if (!C || C->isNullValue())
+      return SDValue();
+    APInt MagicVal;
+    unsigned PreShiftVal, PostShiftVal;
+    UseNPQ = BuildUDIVPattern(C->getAPIntValue(), PreShiftVal, MagicVal,
+                              PostShiftVal);
+    PreShift = DAG.getConstant(PreShiftVal, dl, ShVT);
+    MagicFactor = DAG.getConstant(MagicVal, dl, VT);
+    PostShift = DAG.getConstant(PostShiftVal, dl, ShVT);
   }
 
+  SDValue Q = N0;
+  Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift);
+  Created->push_back(Q.getNode());
+
   // Multiply the numerator (operand 0) by the magic value
   // FIXME: We should support doing a MUL in a wider type
   if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT) :
                             isOperationLegalOrCustom(ISD::MULHU, VT))
-    Q = DAG.getNode(ISD::MULHU, dl, VT, Q, DAG.getConstant(magics.m, dl, VT));
+    Q = DAG.getNode(ISD::MULHU, dl, VT, Q, MagicFactor);
   else if (IsAfterLegalization ?
               isOperationLegal(ISD::UMUL_LOHI, VT) :
-              isOperationLegalOrCustom(ISD::UMUL_LOHI, VT))
-    Q = SDValue(DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), Q,
-                            DAG.getConstant(magics.m, dl, VT)).getNode(), 1);
+              isOperationLegalOrCustom(ISD::UMUL_LOHI, VT)) {
+    SDValue LoHi =
+        DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), Q, MagicFactor);
+    Q = SDValue(LoHi.getNode(), 1);
+  }
   else
     return SDValue();       // No mulhu or equivalent
 
   Created->push_back(Q.getNode());
 
-  if (magics.a == 0) {
-    assert(magics.s < Divisor.getBitWidth() &&
-           "We shouldn't generate an undefined shift!");
-    return DAG.getNode(
-        ISD::SRL, dl, VT, Q,
-        DAG.getConstant(magics.s, dl, getShiftAmountTy(Q.getValueType(), DL)));
-  } else {
-    SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Q);
+  if (UseNPQ) {
+    SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N0, Q);
     Created->push_back(NPQ.getNode());
+
     NPQ = DAG.getNode(
-        ISD::SRL, dl, VT, NPQ,
-        DAG.getConstant(1, dl, getShiftAmountTy(NPQ.getValueType(), DL)));
+        ISD::SRL, dl, VT, NPQ,
+        DAG.getConstant(1, dl, getShiftAmountTy(NPQ.getValueType(), DL)));
     Created->push_back(NPQ.getNode());
+
     NPQ = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
     Created->push_back(NPQ.getNode());
-    return DAG.getNode(
-        ISD::SRL, dl, VT, NPQ,
-        DAG.getConstant(magics.s - 1, dl,
-                        getShiftAmountTy(NPQ.getValueType(), DL)));
+
+    if (VT.isVector()) {
+      Q = DAG.getVectorShuffle(VT, dl, Q, NPQ, ShuffleMask);
+      Created->push_back(Q.getNode());
+    } else {
+      Q = NPQ;
+    }
   }
+
+  return DAG.getNode(ISD::SRL, dl, VT, Q, PostShift);
 }
 
 bool TargetLowering::
Index: test/CodeGen/X86/combine-udiv.ll
===================================================================
--- test/CodeGen/X86/combine-udiv.ll
+++ test/CodeGen/X86/combine-udiv.ll
@@ -365,88 +365,63 @@
 define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
 ; SSE-LABEL: combine_vec_udiv_nonuniform:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    movzwl %ax, %ecx
-; SSE-NEXT:    imull $25645, %ecx, %ecx # imm = 0x642D
-; SSE-NEXT:    shrl $16, %ecx
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    movzwl %ax, %eax
-; SSE-NEXT:    shrl %eax
-; SSE-NEXT:    addl %ecx, %eax
-; SSE-NEXT:    shrl $4, %eax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    pextrw $1, %xmm0, %eax
-; SSE-NEXT:    imull $61681, %eax, %eax # imm = 0xF0F1
-; SSE-NEXT:    shrl $21, %eax
-; SSE-NEXT:    pinsrw $1, %eax, %xmm1
-; SSE-NEXT:    pextrw $2, %xmm0, %eax
-; SSE-NEXT:    imull $8195, %eax, %eax # imm = 0x2003
-; SSE-NEXT:    shrl $29, %eax
-; SSE-NEXT:    pinsrw $2, %eax, %xmm1
-; SSE-NEXT:    pextrw $3, %xmm0, %eax
-; SSE-NEXT:    shrl $3, %eax
-; SSE-NEXT:    imull $9363, %eax, %eax # imm = 0x2493
-; SSE-NEXT:    shrl $16, %eax
-; SSE-NEXT:    pinsrw $3, %eax, %xmm1
-; SSE-NEXT:    pextrw $4, %xmm0, %eax
-; SSE-NEXT:    shrl $7, %eax
-; SSE-NEXT:    pinsrw $4, %eax, %xmm1
-; SSE-NEXT:    pextrw $5, %xmm0, %eax
-; SSE-NEXT:    xorl %ecx, %ecx
-; SSE-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; SSE-NEXT:    sete %cl
-; SSE-NEXT:    pinsrw $5, %ecx, %xmm1
-; SSE-NEXT:    pextrw $6, %xmm0, %eax
-; SSE-NEXT:    imull $32897, %eax, %eax # imm = 0x8081
-; SSE-NEXT:    shrl $31, %eax
-; SSE-NEXT:    pinsrw $6, %eax, %xmm1
-; SSE-NEXT:    pextrw $7, %xmm0, %eax
-; SSE-NEXT:    shrl $15, %eax
-; SSE-NEXT:    pinsrw $7, %eax, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrlw $3, %xmm1
+; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; SSE-NEXT:    pmulhuw {{.*}}(%rip), %xmm1
+; SSE-NEXT:    psubw %xmm1, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrlw $8, %xmm1
+; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5,6],xmm0[7]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrlw $4, %xmm0
+; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6],xmm1[7]
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrlw $2, %xmm1
+; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5,6],xmm0[7]
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_udiv_nonuniform:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    movzwl %ax, %ecx
-; AVX-NEXT:    imull $25645, %ecx, %ecx # imm = 0x642D
-; AVX-NEXT:    shrl $16, %ecx
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    movzwl %ax, %eax
-; AVX-NEXT:    shrl %eax
-; AVX-NEXT:    addl %ecx, %eax
-; AVX-NEXT:    shrl $4, %eax
-; AVX-NEXT:    vmovd %eax, %xmm1
-; AVX-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX-NEXT:    imull $61681, %eax, %eax # imm = 0xF0F1
-; AVX-NEXT:    shrl $21, %eax
-; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX-NEXT:    imull $8195, %eax, %eax # imm = 0x2003
-; AVX-NEXT:    shrl $29, %eax
-; AVX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX-NEXT:    shrl $3, %eax
-; AVX-NEXT:    imull $9363, %eax, %eax # imm = 0x2493
-; AVX-NEXT:    shrl $16, %eax
-; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $4, %xmm0, %eax
-; AVX-NEXT:    shrl $7, %eax
-; AVX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $5, %xmm0, %eax
-; AVX-NEXT:    xorl %ecx, %ecx
-; AVX-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; AVX-NEXT:    sete %cl
-; AVX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $6, %xmm0, %eax
-; AVX-NEXT:    imull $32897, %eax, %eax # imm = 0x8081
-; AVX-NEXT:    shrl $31, %eax
-; AVX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $7, %xmm0, %eax
-; AVX-NEXT:    shrl $15, %eax
-; AVX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: combine_vec_udiv_nonuniform:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5,6],xmm0[7]
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
+; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6],xmm0[7]
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_vec_udiv_nonuniform:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
   %1 = udiv <8 x i16> %x, <i16 23, i16 34, i16 -23, i16 56, i16 128, i16 -1, i16 -256, i16 -32768>
   ret <8 x i16> %1
 }
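
The comments in BuildUDIVPattern compress a fair amount of "Hacker's Delight", so here is a standalone C++ sketch of the two 16-bit expansions the pattern can pick. The magic constants come from the scalar assembly this patch deletes (25645 for divisor 23, which needs the NPQ fixup; 9363 for divisor 56 = 8 * 7, which takes the even-divisor pre-shift instead). The helper names are illustrative, not LLVM APIs, and the checks are exhaustive over all 16-bit dividends.

#include <cassert>
#include <cstdint>

// High half of a 16x16 unsigned multiply -- what ISD::MULHU computes per lane.
static uint16_t mulhu16(uint16_t a, uint16_t b) {
  return uint16_t((uint32_t(a) * b) >> 16);
}

// Divisor 23: magicu() reports a fixup is needed (magics.a != 0), so the
// expansion is the NPQ sequence with PostShift = magics.s - 1 = 4.
static uint16_t udiv23(uint16_t x) {
  uint16_t q = mulhu16(x, 25645);       // MagicFactor (imm 0x642D above)
  uint16_t npq = uint16_t(x - q) >> 1;  // NPQ = (x - q) >> 1
  npq = uint16_t(npq + q);              // NPQ += q
  return npq >> 4;                      // PostShift
}

// Divisor 56 = 8 * 7: even, so PreShift = countTrailingZeros(56) = 3 and the
// magic is recomputed for 7 with 3 known-zero top bits, giving PostShift = 0.
static uint16_t udiv56(uint16_t x) {
  uint16_t q = x >> 3;                  // PreShift
  return mulhu16(q, 9363);              // imm 0x2493 above; PostShift = 0
}

int main() {
  for (uint32_t x = 0; x <= 0xFFFF; ++x) {
    assert(udiv23(uint16_t(x)) == x / 23);
    assert(udiv56(uint16_t(x)) == x / 56);
  }
  return 0;
}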
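
For the vector path, the wrinkle is that only some lanes need the NPQ fixup, so the expansion computes both Q and NPQ for every lane and selects per lane via ShuffleMask (the pblendw after paddw in the SSE output above). Below is a scalar reference model of that lowering for the test's divisor vector; it models the emitted nodes, it is not the DAG itself. The constants for lanes 0-3 and 6 are read off the deleted scalar assembly; the entries for lanes 4, 5, and 7 are worked out by hand here and are an assumption, not quoted from the patch.

#include <cassert>
#include <cstdint>

static uint16_t mulhu16(uint16_t a, uint16_t b) {
  return uint16_t((uint32_t(a) * b) >> 16);
}

// One (PreShift, Magic, PostShift, UseNPQ) entry per lane, mirroring what the
// vectorized BuildUDIV builds for
//   <i16 23, i16 34, i16 -23, i16 56, i16 128, i16 -1, i16 -256, i16 -32768>.
struct LanePattern {
  uint16_t Divisor, PreShift, Magic, PostShift;
  bool UseNPQ;
};

static const LanePattern Lanes[8] = {
    {23,    0, 25645, 4,  true},  // the one NPQ lane (lane 0 in the pblendw)
    {34,    0, 61681, 5,  false},
    {65513, 0, 8195,  13, false}, // i16 -23
    {56,    3, 9363,  0,  false},
    {128,   0, 512,   0,  false}, // hand-derived: mulhu by 512 == x >> 7
    {65535, 0, 32769, 15, false}, // i16 -1, hand-derived
    {65280, 0, 32897, 15, false}, // i16 -256
    {32768, 0, 2,     0,  false}, // i16 -32768, hand-derived: x >> 15
};

// Compute Q and NPQ for a lane, then select one, exactly as the shuffle does
// after the common sub/shift/add. NPQ lanes always have PreShift == 0 (the
// even-divisor case asserts it takes the cheap fixup).
static uint16_t udivLane(const LanePattern &L, uint16_t x) {
  uint16_t q = mulhu16(uint16_t(x >> L.PreShift), L.Magic);
  uint16_t npq = uint16_t(uint16_t(uint16_t(x - q) >> 1) + q);
  return uint16_t(L.UseNPQ ? npq : q) >> L.PostShift;
}

int main() {
  for (const LanePattern &L : Lanes)
    for (uint32_t x = 0; x <= 0xFFFF; ++x)
      assert(udivLane(L, uint16_t(x)) == x / L.Divisor);
  return 0;
}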