Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -3497,8 +3497,7 @@
   //
   SDValue BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
                     SmallVectorImpl<SDNode *> &Created) const;
-  SDValue BuildUDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
-                    bool IsAfterLegalization,
+  SDValue BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
                     SmallVectorImpl<SDNode *> &Created) const;
 
   /// Targets may override this function to provide custom SDIV lowering for
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3278,8 +3278,6 @@
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
-  ConstantSDNode *N1C = isConstOrConstSplat(N1);
-
   // fold (udiv x, (1 << c)) -> x >>u c
   if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
       DAG.isKnownToBeAPowerOfTwo(N1)) {
@@ -3311,7 +3309,8 @@
   // fold (udiv x, c) -> alternate
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
-  if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
+  if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
+      !TLI.isIntDivCheap(N->getValueType(0), Attr))
     if (SDValue Op = BuildUDIV(N))
       return Op;
 
@@ -3468,6 +3467,19 @@
   if (N0.isUndef() || N1.isUndef())
     return DAG.getConstant(0, DL, VT);
 
+  // fold (mulhu x, (1 << c)) -> x >> (bitwidth - c)
+  if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
+      DAG.isKnownToBeAPowerOfTwo(N1) && hasOperation(ISD::SRL, VT)) {
+    SDLoc DL(N);
+    unsigned NumEltBits = VT.getScalarSizeInBits();
+    SDValue LogBase2 = BuildLogBase2(N1, DL);
+    SDValue SRLAmt = DAG.getNode(
+        ISD::SUB, DL, VT, DAG.getConstant(NumEltBits, DL, VT), LogBase2);
+    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
+    SDValue Trunc = DAG.getZExtOrTrunc(SRLAmt, DL, ShiftVT);
+    return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
+  }
+
   // If the type twice as wide is legal, transform the mulhu to a wider multiply
   // plus a shift.
   if (VT.isSimple() && !VT.isVector()) {
@@ -18099,21 +18111,14 @@
   if (DAG.getMachineFunction().getFunction().optForMinSize())
     return SDValue();
 
-  ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
-  if (!C)
-    return SDValue();
-
-  // Avoid division by zero.
-  if (C->isNullValue())
-    return SDValue();
-
   SmallVector<SDNode *, 8> Built;
-  SDValue S =
-      TLI.BuildUDIV(N, C->getAPIntValue(), DAG, LegalOperations, Built);
+  if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, Built)) {
+    for (SDNode *N : Built)
+      AddToWorklist(N);
+    return S;
+  }
 
-  for (SDNode *N : Built)
-    AddToWorklist(N);
-  return S;
+  return SDValue();
 }
 
 /// Determines the LogBase2 value for a non-null input value using the
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3547,72 +3547,142 @@
 /// return a DAG expression to select that will generate the same value by
 /// multiplying by a magic number.
 /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
-SDValue TargetLowering::BuildUDIV(SDNode *N, const APInt &Divisor,
-                                  SelectionDAG &DAG, bool IsAfterLegalization,
+SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
+                                  bool IsAfterLegalization,
                                   SmallVectorImpl<SDNode *> &Created) const {
-  EVT VT = N->getValueType(0);
   SDLoc dl(N);
   auto &DL = DAG.getDataLayout();
 
+  EVT VT = N->getValueType(0);
+  EVT ShVT = getShiftAmountTy(VT, DL);
+
   // Check to see if we can do this.
   // FIXME: We should be more aggressive here.
   if (!isTypeLegal(VT))
     return SDValue();
 
-  // FIXME: We should use a narrower constant when the upper
-  // bits are known to be zero.
-  APInt::mu magics = Divisor.magicu();
-
-  SDValue Q = N->getOperand(0);
-
-  // If the divisor is even, we can avoid using the expensive fixup by shifting
-  // the divided value upfront.
-  if (magics.a != 0 && !Divisor[0]) {
-    unsigned Shift = Divisor.countTrailingZeros();
-    Q = DAG.getNode(
-        ISD::SRL, dl, VT, Q,
-        DAG.getConstant(Shift, dl, getShiftAmountTy(Q.getValueType(), DL)));
-    Created.push_back(Q.getNode());
+  auto BuildUDIVPattern = [](const APInt &Divisor, unsigned &PreShift,
+                             APInt &Magic, unsigned &PostShift) {
+    // FIXME: We should use a narrower constant when the upper
+    // bits are known to be zero.
+    APInt::mu magics = Divisor.magicu();
+    PreShift = PostShift = 0;
+
+    // If the divisor is even, we can avoid using the expensive fixup by
+    // shifting the divided value upfront.
+    if (magics.a != 0 && !Divisor[0]) {
+      PreShift = Divisor.countTrailingZeros();
+      // Get magic number for the shifted divisor.
+      magics = Divisor.lshr(PreShift).magicu(PreShift);
+      assert(magics.a == 0 && "Should use cheap fixup now");
+    }
+
+    Magic = magics.m;
+
+    if (magics.a == 0) {
+      assert(magics.s < Divisor.getBitWidth() &&
+             "We shouldn't generate an undefined shift!");
+      PostShift = magics.s;
+      return false;
+    } else {
+      PostShift = magics.s - 1;
+      return true;
+    }
+  };
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
 
-    // Get magic number for the shifted divisor.
-    magics = Divisor.lshr(Shift).magicu(Shift);
-    assert(magics.a == 0 && "Should use cheap fixup now");
+  // Collect the shifts/magic values from each element.
+  bool UseNPQ = false;
+  SDValue PreShift, PostShift, MagicFactor, NPQFactor;
+  if (VT.isVector()) {
+    EVT SVT = VT.getScalarType();
+    EVT ShSVT = ShVT.getScalarType();
+    unsigned EltBits = VT.getScalarSizeInBits();
+    unsigned NumElts = VT.getVectorNumElements();
+    SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
+    if (ISD::BUILD_VECTOR != N1.getOpcode())
+      return SDValue();
+    for (unsigned i = 0; i != NumElts; ++i) {
+      auto *C = dyn_cast<ConstantSDNode>(N1.getOperand(i));
+      if (!C || C->isNullValue() || C->getAPIntValue().getBitWidth() != EltBits)
+        return SDValue();
+      APInt MagicVal;
+      unsigned PreShiftVal, PostShiftVal;
+      bool SelNPQ = BuildUDIVPattern(C->getAPIntValue(), PreShiftVal, MagicVal,
+                                     PostShiftVal);
+      PreShifts.push_back(DAG.getConstant(PreShiftVal, dl, ShSVT));
+      MagicFactors.push_back(DAG.getConstant(MagicVal, dl, SVT));
+      NPQFactors.push_back(
+          DAG.getConstant(SelNPQ ? APInt::getOneBitSet(EltBits, EltBits - 1)
                                  : APInt::getNullValue(EltBits),
                           dl, SVT));
+      PostShifts.push_back(DAG.getConstant(PostShiftVal, dl, ShSVT));
+      UseNPQ |= SelNPQ;
+    }
+    PreShift = DAG.getBuildVector(ShVT, dl, PreShifts);
+    MagicFactor = DAG.getBuildVector(VT, dl, MagicFactors);
+    NPQFactor = DAG.getBuildVector(VT, dl, NPQFactors);
+    PostShift = DAG.getBuildVector(ShVT, dl, PostShifts);
+  } else {
+    auto *C = dyn_cast<ConstantSDNode>(N1);
+    if (!C || C->isNullValue())
+      return SDValue();
+    APInt MagicVal;
+    unsigned PreShiftVal, PostShiftVal;
+    UseNPQ = BuildUDIVPattern(C->getAPIntValue(), PreShiftVal, MagicVal,
+                              PostShiftVal);
+    PreShift = DAG.getConstant(PreShiftVal, dl, ShVT);
+    MagicFactor = DAG.getConstant(MagicVal, dl, VT);
+    PostShift = DAG.getConstant(PostShiftVal, dl, ShVT);
   }
 
-  // Multiply the numerator (operand 0) by the magic value
-  // FIXME: We should support doing a MUL in a wider type
-  if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT) :
-                            isOperationLegalOrCustom(ISD::MULHU, VT))
-    Q = DAG.getNode(ISD::MULHU, dl, VT, Q, DAG.getConstant(magics.m, dl, VT));
-  else if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT) :
-                                 isOperationLegalOrCustom(ISD::UMUL_LOHI, VT))
-    Q = SDValue(DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), Q,
-                            DAG.getConstant(magics.m, dl, VT)).getNode(), 1);
-  else
-    return SDValue(); // No mulhu or equivalent
+  SDValue Q = N0;
+  Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift);
+  Created.push_back(Q.getNode());
+
+  // FIXME: We should support doing a MUL in a wider type.
+  auto GetMULHU = [&](SDValue X, SDValue Y) {
+    if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT)
                            : isOperationLegalOrCustom(ISD::MULHU, VT))
+      return DAG.getNode(ISD::MULHU, dl, VT, X, Y);
+    if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT)
                            : isOperationLegalOrCustom(ISD::UMUL_LOHI, VT)) {
+      SDValue LoHi =
+          DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y);
+      return SDValue(LoHi.getNode(), 1);
+    }
+    return SDValue(); // No mulhu or equivalent
+  };
+
+  // Multiply the numerator (operand 0) by the magic value.
+  Q = GetMULHU(Q, MagicFactor);
+  if (!Q)
+    return SDValue();
 
   Created.push_back(Q.getNode());
 
-  if (magics.a == 0) {
-    assert(magics.s < Divisor.getBitWidth() &&
-           "We shouldn't generate an undefined shift!");
-    return DAG.getNode(
-        ISD::SRL, dl, VT, Q,
-        DAG.getConstant(magics.s, dl, getShiftAmountTy(Q.getValueType(), DL)));
-  } else {
-    SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Q);
+  if (UseNPQ) {
+    SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N0, Q);
     Created.push_back(NPQ.getNode());
-    NPQ = DAG.getNode(
-        ISD::SRL, dl, VT, NPQ,
-        DAG.getConstant(1, dl, getShiftAmountTy(NPQ.getValueType(), DL)));
+
+    // For vectors we might have a mix of non-NPQ/NPQ paths, so use
+    // MULHU to act as a SRL-by-1 for NPQ, else multiply by zero.
+    if (VT.isVector()) {
+      NPQ = GetMULHU(NPQ, NPQFactor);
+    } else {
+      NPQ = DAG.getNode(
+          ISD::SRL, dl, VT, NPQ,
+          DAG.getConstant(1, dl, getShiftAmountTy(NPQ.getValueType(), DL)));
+    }
     Created.push_back(NPQ.getNode());
-    NPQ = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
+
+    Q = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
     Created.push_back(NPQ.getNode());
-    return DAG.getNode(
-        ISD::SRL, dl, VT, NPQ,
-        DAG.getConstant(magics.s - 1, dl,
-                        getShiftAmountTy(NPQ.getValueType(), DL)));
   }
+
+  return DAG.getNode(ISD::SRL, dl, VT, Q, PostShift);
 }
 
 bool TargetLowering::
Index: test/CodeGen/X86/combine-udiv.ll
===================================================================
--- test/CodeGen/X86/combine-udiv.ll
+++ test/CodeGen/X86/combine-udiv.ll
@@ -365,87 +365,32 @@
 define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
 ; SSE-LABEL: combine_vec_udiv_nonuniform:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    movzwl %ax, %ecx
-; SSE-NEXT:    imull $25645, %ecx, %ecx # imm = 0x642D
-; SSE-NEXT:    shrl $16, %ecx
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    movzwl %ax, %eax
-; SSE-NEXT:    shrl %eax
-; SSE-NEXT:    addl %ecx, %eax
-; SSE-NEXT:    shrl $4, %eax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    pextrw $1, %xmm0, %eax
-; SSE-NEXT:    imull $61681, %eax, %eax # imm = 0xF0F1
-; SSE-NEXT:    shrl $21, %eax
-; SSE-NEXT:    pinsrw $1, %eax, %xmm1
-; SSE-NEXT:    pextrw $2, %xmm0, %eax
-; SSE-NEXT:    imull $8195, %eax, %eax # imm = 0x2003
-; SSE-NEXT:    shrl $29, %eax
-; SSE-NEXT:    pinsrw $2, %eax, %xmm1
-; SSE-NEXT:    pextrw $3, %xmm0, %eax
-; SSE-NEXT:    shrl $3, %eax
-; SSE-NEXT:    imull $9363, %eax, %eax # imm = 0x2493
-; SSE-NEXT:    shrl $16, %eax
-; SSE-NEXT:    pinsrw $3, %eax, %xmm1
-; SSE-NEXT:    pextrw $4, %xmm0, %eax
-; SSE-NEXT:    shrl $7, %eax
-; SSE-NEXT:    pinsrw $4, %eax, %xmm1
-; SSE-NEXT:    pextrw $5, %xmm0, %eax
-; SSE-NEXT:    xorl %ecx, %ecx
-; SSE-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; SSE-NEXT:    sete %cl
-; SSE-NEXT:    pinsrw $5, %ecx, %xmm1
-; SSE-NEXT:    pextrw $6, %xmm0, %eax
-; SSE-NEXT:    imull $32897, %eax, %eax # imm = 0x8081
-; SSE-NEXT:    shrl $31, %eax
-; SSE-NEXT:    pinsrw $6, %eax, %xmm1
-; SSE-NEXT:    pextrw $7, %xmm0, %eax
-; SSE-NEXT:    shrl $15, %eax
-; SSE-NEXT:    pinsrw $7, %eax, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrlw $3, %xmm1
+; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; SSE-NEXT:    pmulhuw {{.*}}(%rip), %xmm1
+; SSE-NEXT:    psubw %xmm1, %xmm0
+; SSE-NEXT:    movl $32768, %eax # imm = 0x8000
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    pmulhuw %xmm0, %xmm2
+; SSE-NEXT:    paddw %xmm1, %xmm2
+; SSE-NEXT:    movdqa {{.*#+}} xmm0 = <4096,2048,8,u,u,2,2,u>
+; SSE-NEXT:    pmulhuw %xmm2, %xmm0
+; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6],xmm2[7]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_udiv_nonuniform:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    movzwl %ax, %ecx
-; AVX-NEXT:    imull $25645, %ecx, %ecx # imm = 0x642D
-; AVX-NEXT:    shrl $16, %ecx
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    movzwl %ax, %eax
-; AVX-NEXT:    shrl %eax
-; AVX-NEXT:    addl %ecx, %eax
-; AVX-NEXT:    shrl $4, %eax
-; AVX-NEXT:    vmovd %eax, %xmm1
-; AVX-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX-NEXT:    imull $61681, %eax, %eax # imm = 0xF0F1
-; AVX-NEXT:    shrl $21, %eax
-; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX-NEXT:    imull $8195, %eax, %eax # imm = 0x2003
-; AVX-NEXT:    shrl $29, %eax
-; AVX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX-NEXT:    shrl $3, %eax
-; AVX-NEXT:    imull $9363, %eax, %eax # imm = 0x2493
-; AVX-NEXT:    shrl $16, %eax
-; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $4, %xmm0, %eax
-; AVX-NEXT:    shrl $7, %eax
-; AVX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $5, %xmm0, %eax
-; AVX-NEXT:    xorl %ecx, %ecx
-; AVX-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; AVX-NEXT:    sete %cl
-; AVX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $6, %xmm0, %eax
-; AVX-NEXT:    imull $32897, %eax, %eax # imm = 0x8081
-; AVX-NEXT:    shrl $31, %eax
-; AVX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $7, %xmm0, %eax
-; AVX-NEXT:    shrl $15, %eax
-; AVX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    movl $32768, %eax # imm = 0x8000
+; AVX-NEXT:    vmovd %eax, %xmm2
+; AVX-NEXT:    vpmulhuw %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
 ; AVX-NEXT:    retq
   %1 = udiv <8 x i16> %x, 
   ret <8 x i16> %1
@@ -454,77 +399,20 @@
 define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
 ; SSE-LABEL: combine_vec_udiv_nonuniform2:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $1, %xmm0, %eax
-; SSE-NEXT:    imull $59919, %eax, %eax # imm = 0xEA0F
-; SSE-NEXT:    shrl $21, %eax
-; SSE-NEXT:    pextrw $0, %xmm0, %ecx
-; SSE-NEXT:    shrl %ecx
-; SSE-NEXT:    imull $16393, %ecx, %ecx # imm = 0x4009
-; SSE-NEXT:    shrl $29, %ecx
-; SSE-NEXT:    movd %ecx, %xmm1
-; SSE-NEXT:    pinsrw $1, %eax, %xmm1
-; SSE-NEXT:    pextrw $2, %xmm0, %eax
-; SSE-NEXT:    imull $58255, %eax, %eax # imm = 0xE38F
-; SSE-NEXT:    shrl $21, %eax
-; SSE-NEXT:    pinsrw $2, %eax, %xmm1
-; SSE-NEXT:    pextrw $3, %xmm0, %eax
-; SSE-NEXT:    imull $32787, %eax, %eax # imm = 0x8013
-; SSE-NEXT:    shrl $31, %eax
-; SSE-NEXT:    pinsrw $3, %eax, %xmm1
-; SSE-NEXT:    pextrw $4, %xmm0, %eax
-; SSE-NEXT:    imull $55189, %eax, %eax # imm = 0xD795
-; SSE-NEXT:    shrl $21, %eax
-; SSE-NEXT:    pinsrw $4, %eax, %xmm1
-; SSE-NEXT:    pextrw $5, %xmm0, %eax
-; SSE-NEXT:    imull $8197, %eax, %eax # imm = 0x2005
-; SSE-NEXT:    shrl $29, %eax
-; SSE-NEXT:    pinsrw $5, %eax, %xmm1
-; SSE-NEXT:    pextrw $6, %xmm0, %eax
-; SSE-NEXT:    imull $52429, %eax, %eax # imm = 0xCCCD
-; SSE-NEXT:    shrl $21, %eax
-; SSE-NEXT:    pinsrw $6, %eax, %xmm1
-; SSE-NEXT:    pextrw $7, %xmm0, %eax
-; SSE-NEXT:    imull $32789, %eax, %eax # imm = 0x8015
-; SSE-NEXT:    shrl $31, %eax
-; SSE-NEXT:    pinsrw $7, %eax, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; SSE-NEXT:    pmulhuw {{.*}}(%rip), %xmm1
+; SSE-NEXT:    pmulhuw {{.*}}(%rip), %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_udiv_nonuniform2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX-NEXT:    imull $59919, %eax, %eax # imm = 0xEA0F
-; AVX-NEXT:    shrl $21, %eax
-; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
-; AVX-NEXT:    shrl %ecx
-; AVX-NEXT:    imull $16393, %ecx, %ecx # imm = 0x4009
-; AVX-NEXT:    shrl $29, %ecx
-; AVX-NEXT:    vmovd %ecx, %xmm1
-; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX-NEXT:    imull $58255, %eax, %eax # imm = 0xE38F
-; AVX-NEXT:    shrl $21, %eax
-; AVX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX-NEXT:    imull $32787, %eax, %eax # imm = 0x8013
-; AVX-NEXT:    shrl $31, %eax
-; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $4, %xmm0, %eax
-; AVX-NEXT:    imull $55189, %eax, %eax # imm = 0xD795
-; AVX-NEXT:    shrl $21, %eax
-; AVX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $5, %xmm0, %eax
-; AVX-NEXT:    imull $8197, %eax, %eax # imm = 0x2005
-; AVX-NEXT:    shrl $29, %eax
-; AVX-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $6, %xmm0, %eax
-; AVX-NEXT:    imull $52429, %eax, %eax # imm = 0xCCCD
-; AVX-NEXT:    shrl $21, %eax
-; AVX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $7, %xmm0, %eax
-; AVX-NEXT:    imull $32789, %eax, %eax # imm = 0x8015
-; AVX-NEXT:    shrl $31, %eax
-; AVX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = udiv <8 x i16> %x, 
   ret <8 x i16> %1
@@ -533,157 +421,21 @@
 define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
 ; SSE-LABEL: combine_vec_udiv_nonuniform3:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $1, %xmm0, %eax
-; SSE-NEXT:    imull $25645, %eax, %ecx # imm = 0x642D
-; SSE-NEXT:    shrl $16, %ecx
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    movzwl %ax, %eax
-; SSE-NEXT:    shrl %eax
-; SSE-NEXT:    addl %ecx, %eax
-; SSE-NEXT:    shrl $4, %eax
-; SSE-NEXT:    movd %xmm0, %ecx
-; SSE-NEXT:    movzwl %cx, %edx
-; SSE-NEXT:    imull $9363, %edx, %edx # imm = 0x2493
-; SSE-NEXT:    shrl $16, %edx
-; SSE-NEXT:    subl %edx, %ecx
-; SSE-NEXT:    movzwl %cx, %ecx
-; SSE-NEXT:    shrl %ecx
-; SSE-NEXT:    addl %edx, %ecx
-; SSE-NEXT:    shrl $2, %ecx
-; SSE-NEXT:    movd %ecx, %xmm1
-; SSE-NEXT:    pinsrw $1, %eax, %xmm1
-; SSE-NEXT:    pextrw $2, %xmm0, %eax
-; SSE-NEXT:    imull $18351, %eax, %ecx # imm = 0x47AF
-; SSE-NEXT:    shrl $16, %ecx
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    movzwl %ax, %eax
-; SSE-NEXT:    shrl %eax
-; SSE-NEXT:    addl %ecx, %eax
-; SSE-NEXT:    shrl $4, %eax
-; SSE-NEXT:    pinsrw $2, %eax, %xmm1
-; SSE-NEXT:    pextrw $3, %xmm0, %eax
-; SSE-NEXT:    imull $12137, %eax, %ecx # imm = 0x2F69
-; SSE-NEXT:    shrl $16, %ecx
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    movzwl %ax, %eax
-; SSE-NEXT:    shrl %eax
-; SSE-NEXT:    addl %ecx, %eax
-; SSE-NEXT:    shrl $4, %eax
-; SSE-NEXT:    pinsrw $3, %eax, %xmm1
-; SSE-NEXT:    pextrw $4, %xmm0, %eax
-; SSE-NEXT:    imull $2115, %eax, %ecx # imm = 0x843
-; SSE-NEXT:    shrl $16, %ecx
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    movzwl %ax, %eax
-; SSE-NEXT:    shrl %eax
-; SSE-NEXT:    addl %ecx, %eax
-; SSE-NEXT:    shrl $4, %eax
-; SSE-NEXT:    pinsrw $4, %eax, %xmm1
-; SSE-NEXT:    pextrw $5, %xmm0, %eax
-; SSE-NEXT:    imull $23705, %eax, %ecx # imm = 0x5C99
-; SSE-NEXT:    shrl $16, %ecx
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    movzwl %ax, %eax
-; SSE-NEXT:    shrl %eax
-; SSE-NEXT:    addl %ecx, %eax
-; SSE-NEXT:    shrl $5, %eax
-; SSE-NEXT:    pinsrw $5, %eax, %xmm1
-; SSE-NEXT:    pextrw $6, %xmm0, %eax
-; SSE-NEXT:    imull $1041, %eax, %ecx # imm = 0x411
-; SSE-NEXT:    shrl $16, %ecx
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    movzwl %ax, %eax
-; SSE-NEXT:    shrl %eax
-; SSE-NEXT:    addl %ecx, %eax
-; SSE-NEXT:    shrl $5, %eax
-; SSE-NEXT:    pinsrw $6, %eax, %xmm1
-; SSE-NEXT:    pextrw $7, %xmm0, %eax
-; SSE-NEXT:    imull $517, %eax, %ecx # imm = 0x205
-; SSE-NEXT:    shrl $16, %ecx
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    movzwl %ax, %eax
-; SSE-NEXT:    shrl %eax
-; SSE-NEXT:    addl %ecx, %eax
-; SSE-NEXT:    shrl $6, %eax
-; SSE-NEXT:    pinsrw $7, %eax, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [9363,25645,18351,12137,2115,23705,1041,517]
+; SSE-NEXT:    pmulhuw %xmm0, %xmm1
+; SSE-NEXT:    psubw %xmm1, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    pmulhuw {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_udiv_nonuniform3:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX-NEXT:    imull $25645, %eax, %ecx # imm = 0x642D
-; AVX-NEXT:    shrl $16, %ecx
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    movzwl %ax, %eax
-; AVX-NEXT:    shrl %eax
-; AVX-NEXT:    addl %ecx, %eax
-; AVX-NEXT:    shrl $4, %eax
-; AVX-NEXT:    vmovd %xmm0, %ecx
-; AVX-NEXT:    movzwl %cx, %edx
-; AVX-NEXT:    imull $9363, %edx, %edx # imm = 0x2493
-; AVX-NEXT:    shrl $16, %edx
-; AVX-NEXT:    subl %edx, %ecx
-; AVX-NEXT:    movzwl %cx, %ecx
-; AVX-NEXT:    shrl %ecx
-; AVX-NEXT:    addl %edx, %ecx
-; AVX-NEXT:    shrl $2, %ecx
-; AVX-NEXT:    vmovd %ecx, %xmm1
-; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX-NEXT:    imull $18351, %eax, %ecx # imm = 0x47AF
-; AVX-NEXT:    shrl $16, %ecx
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    movzwl %ax, %eax
-; AVX-NEXT:    shrl %eax
-; AVX-NEXT:    addl %ecx, %eax
-; AVX-NEXT:    shrl $4, %eax
-; AVX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX-NEXT:    imull $12137, %eax, %ecx # imm = 0x2F69
-; AVX-NEXT:    shrl $16, %ecx
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    movzwl %ax, %eax
-; AVX-NEXT:    shrl %eax
-; AVX-NEXT:    addl %ecx, %eax
-; AVX-NEXT:    shrl $4, %eax
-; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $4, %xmm0, %eax
-; AVX-NEXT:    imull $2115, %eax, %ecx # imm = 0x843
-; AVX-NEXT:    shrl $16, %ecx
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    movzwl %ax, %eax
-; AVX-NEXT:    shrl %eax
-; AVX-NEXT:    addl %ecx, %eax
-; AVX-NEXT:    shrl $4, %eax
-; AVX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $5, %xmm0, %eax
-; AVX-NEXT:    imull $23705, %eax, %ecx # imm = 0x5C99
-; AVX-NEXT:    shrl $16, %ecx
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    movzwl %ax, %eax
-; AVX-NEXT:    shrl %eax
-; AVX-NEXT:    addl %ecx, %eax
-; AVX-NEXT:    shrl $5, %eax
-; AVX-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $6, %xmm0, %eax
-; AVX-NEXT:    imull $1041, %eax, %ecx # imm = 0x411
-; AVX-NEXT:    shrl $16, %ecx
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    movzwl %ax, %eax
-; AVX-NEXT:    shrl %eax
-; AVX-NEXT:    addl %ecx, %eax
-; AVX-NEXT:    shrl $5, %eax
-; AVX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $7, %xmm0, %eax
-; AVX-NEXT:    imull $517, %eax, %ecx # imm = 0x205
-; AVX-NEXT:    shrl $16, %ecx
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    movzwl %ax, %eax
-; AVX-NEXT:    shrl %eax
-; AVX-NEXT:    addl %ecx, %eax
-; AVX-NEXT:    shrl $6, %eax
-; AVX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = udiv <8 x i16> %x, 
   ret <8 x i16> %1
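
Reviewer note, illustration only and not part of the patch: the scalar sketch below models the per-element expansion BuildUDIV emits, i.e. optional pre-shift of the dividend, MULHU by a magic factor, the optional "NPQ" subtract/shift/add fixup, and a final post-shift. The helper names (UDivPattern, udiv_by_const16) are hypothetical; the divide-by-7 constants (magic 9363, NPQ path, post-shift 2) are read off the imull $9363 / subl / shrl / addl / shrl $2 sequence in the old scalar test output above. On the vector path the patch replaces the SRL-by-1 inside the fixup with a MULHU by 0x8000 (or by 0 for lanes that do not need the fixup), which computes the same value lane-wise.

#include <cassert>
#include <cstdint>

// Per-element expansion parameters, mirroring what BuildUDIVPattern computes.
struct UDivPattern {
  unsigned PreShift;  // dividend shifted right first (even divisors only)
  uint16_t Magic;     // MULHU factor: take the high 16 bits of x * Magic
  bool UseNPQ;        // whether the add-back ("NPQ") fixup is required
  unsigned PostShift; // final logical right shift
};

// Scalar model of the emitted DAG: SRL, MULHU, optional NPQ fixup, SRL.
static uint16_t udiv_by_const16(uint16_t X, const UDivPattern &P) {
  uint16_t Q = (uint16_t)(((uint32_t)(X >> P.PreShift) * P.Magic) >> 16); // MULHU
  if (P.UseNPQ) {
    uint16_t NPQ = (uint16_t)((uint16_t)(X - Q) >> 1); // SRL-by-1 fixup
    Q = (uint16_t)(NPQ + Q); // (X + Q) / 2 < 2^16, so this cannot overflow
  }
  return (uint16_t)(Q >> P.PostShift);
}

int main() {
  // Divide-by-7 pattern taken from the old test output: magic 9363, NPQ, shift 2.
  UDivPattern Seven = {0, 9363, true, 2};
  for (uint32_t X = 0; X <= 0xFFFF; ++X)
    assert(udiv_by_const16((uint16_t)X, Seven) == X / 7);
  return 0;
}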