diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3965,6 +3965,24 @@ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const; + // Return true if it's desirable to perform the following transform: + // (fmul C, (uitofp Pow2)) + // -> (bitcast_to_FP (add (bitcast_to_INT C), Log2(Pow2) << mantissa)) + // (fdiv C, (uitofp Pow2)) + // -> (bitcast_to_FP (sub (bitcast_to_INT C), Log2(Pow2) << mantissa)) + // + // This is only queried after we have verified the transform will be bitwise + // equal. + // + // SDNode *N : The FDiv/FMul node we want to transform. + // SDValue FPConst: The Float constant operand in `N`. + // SDValue IntPow2: The Integer power of 2 operand in `N`. + virtual bool optimizeFMulOrFDivAsShiftAddBitcast(SDNode *N, SDValue FPConst, + SDValue IntPow2) const { + // By default only transform fdiv, which is often very expensive. + return N->getOpcode() == ISD::FDIV; + } + /// Return true if Op can create undef or poison from non-undef & non-poison /// operands. The DemandedElts argument limits the check to the requested /// vector elements. diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16137,6 +16137,211 @@ return SDValue(); } +// This is basically just a port of takeLog2 from InstCombineMulDivRem.cpp +// +// Returns the node that represents `Log2(Op)`. This may create a new node. If +// we are unable to compute `Log2(Op)` this returns `SDValue()`. +// +// All nodes will be created at `DL` and the output will be of type `VT`. +// +// This will only return `Log2(Op)` if we can prove `Op` is non-zero.
Set +// `AssumeNonZero` if this function should simply assume (rather than prove) +// that `Op` is non-zero. +static SDValue takeLog2(SelectionDAG &DAG, const SDLoc &DL, EVT VT, SDValue Op, + unsigned Depth, bool AssumeNonZero) { + assert(VT.isInteger() && "Only integer types are supported!"); + + auto PeekThroughCastsAndTrunc = [](SDValue V) { + while (true) { + switch (V.getOpcode()) { + case ISD::TRUNCATE: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + V = V.getOperand(0); + break; + default: + return V; + } + } + }; + + Op = PeekThroughCastsAndTrunc(Op); + + // Helper for determining whether a value is a power-2 constant scalar or a + // vector of such elements. + SmallVector<APInt> Pow2Constants; + auto IsPowerOfTwo = [&Pow2Constants](ConstantSDNode *C) { + if (C->isZero() || C->isOpaque()) + return false; + // TODO: We may also be able to support negative powers of 2 here. + if (C->getAPIntValue().isPowerOf2()) { + Pow2Constants.emplace_back(C->getAPIntValue()); + return true; + } + return false; + }; + + if (ISD::matchUnaryPredicate(Op, IsPowerOfTwo)) { + if (!VT.isVector()) + return DAG.getConstant(Pow2Constants.back().logBase2(), DL, VT); + // We need to create a build vector + SmallVector<SDValue> Log2Ops; + for (const APInt &Pow2 : Pow2Constants) + Log2Ops.emplace_back( + DAG.getConstant(Pow2.logBase2(), DL, VT.getScalarType())); + return DAG.getBuildVector(VT, DL, Log2Ops); + } + + if (Depth >= DAG.MaxRecursionDepth) + return SDValue(); + // Cast `ToCast` to `NewVT`; its type is taken after peeking through casts. + auto CastToVT = [&](EVT NewVT, SDValue ToCast) { + ToCast = PeekThroughCastsAndTrunc(ToCast); + EVT CurVT = ToCast.getValueType(); + if (NewVT == CurVT) + return ToCast; + + if (NewVT.getSizeInBits() == CurVT.getSizeInBits()) + return DAG.getBitcast(NewVT, ToCast); + + return DAG.getZExtOrTrunc(ToCast, DL, NewVT); + }; + + // log2(X << Y) -> log2(X) + Y + if (Op.getOpcode() == ISD::SHL) { + // 1 << Y and X nuw/nsw << Y are all non-zero.
+ if (AssumeNonZero || Op->getFlags().hasNoUnsignedWrap() || + Op->getFlags().hasNoSignedWrap() || isOneConstant(Op.getOperand(0))) + if (SDValue LogX = + takeLog2(DAG, DL, VT, Op.getOperand(0), Depth + 1, AssumeNonZero)) + return DAG.getNode(ISD::ADD, DL, VT, LogX, + CastToVT(VT, Op.getOperand(1))); + } + + // c ? X : Y -> c ? Log2(X) : Log2(Y) + if (Op.getOpcode() == ISD::SELECT) { + if (SDValue LogX = + takeLog2(DAG, DL, VT, Op.getOperand(1), Depth + 1, AssumeNonZero)) + if (SDValue LogY = + takeLog2(DAG, DL, VT, Op.getOperand(2), Depth + 1, AssumeNonZero)) + return DAG.getSelect(DL, VT, Op.getOperand(0), LogX, LogY); + } + + // log2(umin(X, Y)) -> umin(log2(X), log2(Y)) + // log2(umax(X, Y)) -> umax(log2(X), log2(Y)) + if (Op.getOpcode() == ISD::UMIN || Op.getOpcode() == ISD::UMAX) { + // Use AssumeNonZero as false here. Otherwise we can hit a case where + // log2(umax(X, Y)) != umax(log2(X), log2(Y)) (because overflow). + if (SDValue LogX = takeLog2(DAG, DL, VT, Op.getOperand(0), Depth + 1, + /*AssumeNonZero*/ false)) + if (SDValue LogY = takeLog2(DAG, DL, VT, Op.getOperand(1), Depth + 1, + /*AssumeNonZero*/ false)) + return DAG.getNode(Op.getOpcode(), DL, VT, LogX, LogY); + } + + return SDValue(); +} + +// Transform IEEE Floats: +// (fmul C, (uitofp Pow2)) +// -> (bitcast_to_FP (add (bitcast_to_INT C), Log2(Pow2) << mantissa)) +// (fdiv C, (uitofp Pow2)) +// -> (bitcast_to_FP (sub (bitcast_to_INT C), Log2(Pow2) << mantissa)) +// +// The rationale is fmul/fdiv by a power of 2 just changes the exponent, so +// there is no need for more than an add/sub. +// +// This is valid under the following circumstances: +// 1) We are dealing with IEEE floats +// 2) C is normal +// 3) The fmul/fdiv add/sub will not go outside of min/max exponent bounds.
+static SDValue combineFMulOrFDivWithIntPow2(DAGCombiner *DC, + const TargetLowering &TLI, + SDNode *N) { + SelectionDAG &DAG = DC->getDAG(); + EVT VT = N->getValueType(0); + SDValue ConstOp, Pow2Op; + + int Mantissa = -1; + auto GetConstAndPow2Ops = [&](unsigned ConstOpIdx) { + if (ConstOpIdx == 1 && N->getOpcode() == ISD::FDIV) + return false; + // Pick operands; sint_to_fp qualifies only if its source is non-negative. + ConstOp = peekThroughBitcasts(N->getOperand(ConstOpIdx)); + Pow2Op = N->getOperand(1 - ConstOpIdx); + if (Pow2Op.getOpcode() != ISD::UINT_TO_FP && + (Pow2Op.getOpcode() != ISD::SINT_TO_FP || + !DAG.computeKnownBits(Pow2Op.getOperand(0)).isNonNegative())) + return false; + + Pow2Op = Pow2Op.getOperand(0); + + // TODO(1): We may be able to include undefs. + // TODO(2): We could also handle non-splat vector types. + ConstantFPSDNode *CFP = + isConstOrConstSplatFP(ConstOp, /*AllowUndefs*/ false); + if (CFP == nullptr) + return false; + const APFloat &APF = CFP->getValueAPF(); + + // Make sure we have a normal IEEE constant. + if (!APF.isNormal() || !APF.isIEEE()) + return false; + + // `Log2(Pow2Op) < Pow2Op.getScalarSizeInBits()`. + // TODO: We could use knownbits to make this bound more precise. + int MaxExpChange = Pow2Op.getValueType().getScalarSizeInBits(); + + // Make sure the float's exponent is within the bounds for which this + // transform produces a bitwise-equal value. + int CurExp = ilogb(APF); + // FMul by pow2 will only increase exponent. + int MinExp = N->getOpcode() == ISD::FMUL ? CurExp : (CurExp - MaxExpChange); + // FDiv by pow2 will only decrease exponent. + int MaxExp = N->getOpcode() == ISD::FDIV ? CurExp : (CurExp + MaxExpChange); + if (MinExp <= APFloat::semanticsMinExponent(APF.getSemantics()) || + MaxExp >= APFloat::semanticsMaxExponent(APF.getSemantics())) + return false; + + // Finally make sure we actually know the mantissa for the float type.
+ Mantissa = APFloat::semanticsPrecision(APF.getSemantics()) - 1; + return Mantissa > 0; + }; + + if (!GetConstAndPow2Ops(0) && !GetConstAndPow2Ops(1)) + return SDValue(); + + if (!TLI.optimizeFMulOrFDivAsShiftAddBitcast(N, ConstOp, Pow2Op)) + return SDValue(); + + // Get log2 after all other checks have taken place. This is because takeLog2 + // may create a new node. + SDLoc DL(N); + // Get Log2 type with same bitwidth as the float type (VT). + EVT NewIntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits()); + if (VT.isVector()) + NewIntVT = EVT::getVectorVT(*DAG.getContext(), NewIntVT, + VT.getVectorNumElements()); + + SDValue Log2 = takeLog2(DAG, DL, NewIntVT, Pow2Op, /*Depth*/ 0, + DAG.isKnownNeverZero(Pow2Op)); + if (!Log2) + return SDValue(); + + // Perform actual transform. + SDValue MantissaShiftCnt = + DAG.getConstant(Mantissa, DL, DC->getShiftAmountTy(NewIntVT)); + // TODO: Sometimes Log2 is of form `(X + C)`. `(X + C) << C1` should fold to + // `(X << C1) + (C << C1)`, but that isn't always the case because of the + // cast. We could implement that by handling the casts here. + SDValue Shift = DAG.getNode(ISD::SHL, DL, NewIntVT, Log2, MantissaShiftCnt); + SDValue ResAsInt = + DAG.getNode(N->getOpcode() == ISD::FMUL ? ISD::ADD : ISD::SUB, DL, + NewIntVT, DAG.getBitcast(NewIntVT, ConstOp), Shift); + SDValue ResAsFP = DAG.getBitcast(VT, ResAsInt); + return ResAsFP; +} + SDValue DAGCombiner::visitFMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -16277,6 +16482,11 @@ return Fused; } + // Don't do `combineFMulOrFDivWithIntPow2` until after FMUL -> FMA has been + // able to run.
+ if (SDValue R = combineFMulOrFDivWithIntPow2(this, this->TLI, N)) + return R; + return SDValue(); } @@ -16628,6 +16838,9 @@ return DAG.getNode(ISD::FDIV, SDLoc(N), VT, NegN0, NegN1); } + if (SDValue R = combineFMulOrFDivWithIntPow2(this, this->TLI, N)) + return R; + return SDValue(); } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1796,6 +1796,9 @@ const SDLoc &dl, SelectionDAG &DAG, SDValue &X86CC) const; + bool optimizeFMulOrFDivAsShiftAddBitcast(SDNode *N, SDValue FPConst, + SDValue IntPow2) const override; + /// Check if replacement of SQRT with RSQRT should be disabled. bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25028,6 +25028,22 @@ return Sub.getValue(1); } +bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast( + SDNode *N, SDValue, SDValue IntPow2) const { + if (N->getOpcode() == ISD::FDIV) + return true; + + EVT FPVT = N->getValueType(0); + EVT IntVT = IntPow2.getValueType(); + + // Vector FP/int element widths that differ indicate a non-free bitcast. + if (FPVT.isVector() && + FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits()) + return false; + + return true; +} + /// Check if replacement of SQRT with RSQRT should be disabled.
bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -11,40 +11,19 @@ define double @fmul_pow_shl_cnt(i64 %cnt) { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movq %rdi, %rcx -; CHECK-SSE-NEXT: movl $1, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-SSE-NEXT: shlq %cl, %rax -; CHECK-SSE-NEXT: movq %rax, %xmm1 -; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; CHECK-SSE-NEXT: addsd %xmm1, %xmm0 -; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: shlq $52, %rdi +; CHECK-SSE-NEXT: movabsq $4621256167635550208, %rax # imm = 0x4022000000000000 +; CHECK-SSE-NEXT: addq %rdi, %rax +; CHECK-SSE-NEXT: movq %rax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow_shl_cnt: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movq %rdi, %rcx -; CHECK-AVX2-NEXT: movl $1, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX2-NEXT: shlq %cl, %rax -; CHECK-AVX2-NEXT: vmovq %rax, %xmm0 -; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax -; CHECK-FMA-NEXT: 
vcvtusi2sd %rax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow_shl_cnt: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: shlq $52, %rdi +; CHECK-AVX-NEXT: movabsq $4621256167635550208, %rax # imm = 0x4022000000000000 +; CHECK-AVX-NEXT: addq %rdi, %rax +; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw i64 1, %cnt %conv = uitofp i64 %shl to double %mul = fmul double 9.000000e+00, %conv @@ -54,40 +33,21 @@ define double @fmul_pow_shl_cnt2(i64 %cnt) { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt2: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movq %rdi, %rcx -; CHECK-SSE-NEXT: movl $2, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-SSE-NEXT: shlq %cl, %rax -; CHECK-SSE-NEXT: movq %rax, %xmm1 -; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; CHECK-SSE-NEXT: addsd %xmm1, %xmm0 -; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: incl %edi +; CHECK-SSE-NEXT: shlq $52, %rdi +; CHECK-SSE-NEXT: movabsq $-4602115869219225600, %rax # imm = 0xC022000000000000 +; CHECK-SSE-NEXT: addq %rdi, %rax +; CHECK-SSE-NEXT: movq %rax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow_shl_cnt2: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movq %rdi, %rcx -; CHECK-AVX2-NEXT: movl $2, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX2-NEXT: shlq %cl, %rax -; CHECK-AVX2-NEXT: vmovq %rax, %xmm0 -; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; 
CHECK-AVX2-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt2: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $2, %eax -; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax -; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow_shl_cnt2: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: incl %edi +; CHECK-AVX-NEXT: shlq $52, %rdi +; CHECK-AVX-NEXT: movabsq $-4602115869219225600, %rax # imm = 0xC022000000000000 +; CHECK-AVX-NEXT: addq %rdi, %rax +; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw i64 2, %cnt %conv = uitofp i64 %shl to double %mul = fmul double -9.000000e+00, %conv @@ -97,37 +57,25 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) { ; CHECK-SSE-LABEL: fmul_pow_select: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movl %edi, %ecx -; CHECK-SSE-NEXT: andl $1, %esi -; CHECK-SSE-NEXT: movl $2, %eax -; CHECK-SSE-NEXT: subl %esi, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-SSE-NEXT: shll %cl, %eax -; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0 -; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-SSE-NEXT: leal 1(%rdi), %eax +; CHECK-SSE-NEXT: testb $1, %sil +; CHECK-SSE-NEXT: cmovnel %edi, %eax +; CHECK-SSE-NEXT: shll $23, %eax +; CHECK-SSE-NEXT: addl $1091567616, %eax # imm = 0x41100000 +; CHECK-SSE-NEXT: movd %eax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow_select: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movl %edi, %ecx -; CHECK-AVX2-NEXT: andl $1, %esi -; CHECK-AVX2-NEXT: movl $2, %eax -; CHECK-AVX2-NEXT: subl %esi, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_select: -; CHECK-FMA: # %bb.0: -; 
CHECK-FMA-NEXT: andl $1, %esi -; CHECK-FMA-NEXT: movl $2, %eax -; CHECK-FMA-NEXT: subl %esi, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow_select: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-AVX-NEXT: leal 1(%rdi), %eax +; CHECK-AVX-NEXT: testb $1, %sil +; CHECK-AVX-NEXT: cmovnel %edi, %eax +; CHECK-AVX-NEXT: shll $23, %eax +; CHECK-AVX-NEXT: addl $1091567616, %eax # imm = 0x41100000 +; CHECK-AVX-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-NEXT: retq %shl2 = shl nuw i32 2, %cnt %shl1 = shl nuw i32 1, %cnt %shl = select i1 %c, i32 %shl1, i32 %shl2 @@ -139,40 +87,25 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) { ; CHECK-SSE-LABEL: fmul_fly_pow_mul_min_pow2: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movq %rdi, %rcx -; CHECK-SSE-NEXT: movl $8, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-SSE-NEXT: shlq %cl, %rax -; CHECK-SSE-NEXT: cmpq $8192, %rax # imm = 0x2000 -; CHECK-SSE-NEXT: movl $8192, %ecx # imm = 0x2000 -; CHECK-SSE-NEXT: cmovbq %rax, %rcx -; CHECK-SSE-NEXT: cvtsi2ss %rcx, %xmm0 -; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: addl $3, %edi +; CHECK-SSE-NEXT: cmpl $13, %edi +; CHECK-SSE-NEXT: movl $13, %eax +; CHECK-SSE-NEXT: cmovbl %edi, %eax +; CHECK-SSE-NEXT: shll $23, %eax +; CHECK-SSE-NEXT: addl $1091567616, %eax # imm = 0x41100000 +; CHECK-SSE-NEXT: movd %eax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_fly_pow_mul_min_pow2: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movq %rdi, %rcx -; CHECK-AVX2-NEXT: movl $8, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX2-NEXT: shlq %cl, %rax -; CHECK-AVX2-NEXT: cmpq $8192, %rax # imm = 0x2000 -; CHECK-AVX2-NEXT: movl $8192, %ecx # imm = 0x2000 -; CHECK-AVX2-NEXT: cmovbq %rax, %rcx -; CHECK-AVX2-NEXT: 
vcvtsi2ss %rcx, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_fly_pow_mul_min_pow2: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $8, %eax -; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax -; CHECK-FMA-NEXT: cmpq $8192, %rax # imm = 0x2000 -; CHECK-FMA-NEXT: movl $8192, %ecx # imm = 0x2000 -; CHECK-FMA-NEXT: cmovbq %rax, %rcx -; CHECK-FMA-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_fly_pow_mul_min_pow2: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: addl $3, %edi +; CHECK-AVX-NEXT: cmpl $13, %edi +; CHECK-AVX-NEXT: movl $13, %eax +; CHECK-AVX-NEXT: cmovbl %edi, %eax +; CHECK-AVX-NEXT: shll $23, %eax +; CHECK-AVX-NEXT: addl $1091567616, %eax # imm = 0x41100000 +; CHECK-AVX-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-NEXT: retq %shl8 = shl nuw i64 8, %cnt %shl = call i64 @llvm.umin.i64(i64 %shl8, i64 8192) %conv = uitofp i64 %shl to float @@ -183,46 +116,27 @@ define double @fmul_pow_mul_max_pow2(i16 %cnt) { ; CHECK-SSE-LABEL: fmul_pow_mul_max_pow2: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movl %edi, %ecx -; CHECK-SSE-NEXT: movl $2, %eax -; CHECK-SSE-NEXT: shll %cl, %eax -; CHECK-SSE-NEXT: movl $1, %edx -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-SSE-NEXT: shll %cl, %edx -; CHECK-SSE-NEXT: cmpw %ax, %dx -; CHECK-SSE-NEXT: cmovbel %eax, %edx -; CHECK-SSE-NEXT: movzwl %dx, %eax -; CHECK-SSE-NEXT: cvtsi2sd %eax, %xmm0 -; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: movl %edi, %eax +; CHECK-SSE-NEXT: leaq 1(%rax), %rcx +; CHECK-SSE-NEXT: cmpq %rcx, %rax +; CHECK-SSE-NEXT: cmovaq %rax, %rcx +; CHECK-SSE-NEXT: shlq $52, %rcx +; CHECK-SSE-NEXT: movabsq $4613937818241073152, %rax # imm = 0x4008000000000000 +; CHECK-SSE-NEXT: addq %rcx, %rax +; CHECK-SSE-NEXT: movq %rax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: 
fmul_pow_mul_max_pow2: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movl %edi, %ecx -; CHECK-AVX2-NEXT: movl $2, %eax -; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: movl $1, %edx -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX2-NEXT: shll %cl, %edx -; CHECK-AVX2-NEXT: cmpw %ax, %dx -; CHECK-AVX2-NEXT: cmovbel %eax, %edx -; CHECK-AVX2-NEXT: movzwl %dx, %eax -; CHECK-AVX2-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_mul_max_pow2: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $2, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: movl $1, %ecx -; CHECK-FMA-NEXT: shlxl %edi, %ecx, %ecx -; CHECK-FMA-NEXT: cmpw %ax, %cx -; CHECK-FMA-NEXT: cmoval %ecx, %eax -; CHECK-FMA-NEXT: movzwl %ax, %eax -; CHECK-FMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow_mul_max_pow2: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: movl %edi, %eax +; CHECK-AVX-NEXT: leaq 1(%rax), %rcx +; CHECK-AVX-NEXT: cmpq %rcx, %rax +; CHECK-AVX-NEXT: cmovaq %rax, %rcx +; CHECK-AVX-NEXT: shlq $52, %rcx +; CHECK-AVX-NEXT: movabsq $4613937818241073152, %rax # imm = 0x4008000000000000 +; CHECK-AVX-NEXT: addq %rcx, %rax +; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: retq %shl2 = shl nuw i16 2, %cnt %shl1 = shl nuw i16 1, %cnt %shl = call i16 @llvm.umax.i16(i16 %shl1, i16 %shl2) @@ -352,43 +266,20 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2,2] -; CHECK-SSE-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE-NEXT: psllq %xmm0, %xmm2 -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; CHECK-SSE-NEXT: psllq %xmm0, %xmm1 -; CHECK-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = 
[4294967295,4294967295] -; CHECK-SSE-NEXT: andpd %xmm1, %xmm0 -; CHECK-SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: psrlq $32, %xmm1 -; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: addpd %xmm0, %xmm1 -; CHECK-SSE-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: movapd %xmm1, %xmm0 +; CHECK-SSE-NEXT: psllq $52, %xmm0 +; CHECK-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] -; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec: ; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] -; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtuqq2pd %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> @@ -401,39 +292,14 @@ ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: pslld $23, %xmm0 ; CHECK-SSE-NEXT: paddd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2] -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE-NEXT: pmuludq %xmm2, %xmm0 -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] -; CHECK-SSE-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-SSE-NEXT: psrld $16, %xmm0 -; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: addps %xmm2, %xmm0 -; CHECK-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: addps %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] -; CHECK-AVX2-NEXT: vpsllvd %xmm0, %xmm2, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] -; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; CHECK-AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1392508928,1392508928,1392508928,1392508928] -; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; CHECK-AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vaddps %xmm0, %xmm2, %xmm0 -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0] -; CHECK-AVX2-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd 
{{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -501,44 +367,15 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2,1] -; CHECK-SSE-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE-NEXT: psllq %xmm0, %xmm2 -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; CHECK-SSE-NEXT: psllq %xmm0, %xmm1 -; CHECK-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [4294967295,4294967295] -; CHECK-SSE-NEXT: andpd %xmm1, %xmm0 -; CHECK-SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: psrlq $32, %xmm1 -; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: addpd %xmm0, %xmm1 -; CHECK-SSE-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: movapd %xmm1, %xmm0 +; CHECK-SSE-NEXT: psllq $52, %xmm0 +; CHECK-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2,1] -; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: -; 
CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vmovdqa {{.*#+}} xmm1 = [2,1] -; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtuqq2pd %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> %mul = fmul <2 x double> , %conv @@ -722,34 +559,21 @@ define double @fmul_pow_shl_cnt_safe(i16 %cnt) { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_safe: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movl %edi, %ecx -; CHECK-SSE-NEXT: movl $1, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-SSE-NEXT: shll %cl, %eax -; CHECK-SSE-NEXT: movzwl %ax, %eax -; CHECK-SSE-NEXT: cvtsi2sd %eax, %xmm0 -; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-SSE-NEXT: shlq $52, %rdi +; CHECK-SSE-NEXT: movabsq $8930638061065157010, %rax # imm = 0x7BEFFFFFFF5F3992 +; CHECK-SSE-NEXT: addq %rdi, %rax +; CHECK-SSE-NEXT: movq %rax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_safe: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movl %edi, %ecx -; CHECK-AVX2-NEXT: movl $1, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: movzwl %ax, %eax -; CHECK-AVX2-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt_safe: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: movzwl %ax, %eax -; CHECK-FMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow_shl_cnt_safe: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-AVX-NEXT: shlq $52, %rdi +; CHECK-AVX-NEXT: movabsq $8930638061065157010, %rax # imm = 0x7BEFFFFFFF5F3992 +; CHECK-AVX-NEXT: addq %rdi, %rax +; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to double %mul = fmul double 9.745314e+288, %conv @@ -759,48 +583,18 @@ define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_vec: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,1] -; CHECK-SSE-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE-NEXT: psllq %xmm0, %xmm2 -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; CHECK-SSE-NEXT: psllq %xmm0, %xmm1 -; CHECK-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [4294967295,4294967295] -; CHECK-SSE-NEXT: andpd %xmm1, %xmm0 -; CHECK-SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: psrlq $32, %xmm1 -; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: addpd %xmm0, %xmm1 -; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,1.0E+0] -; CHECK-SSE-NEXT: divpd %xmm1, %xmm0 +; CHECK-SSE-NEXT: psllq $52, %xmm0 +; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408] +; CHECK-SSE-NEXT: psubq %xmm0, %xmm1 +; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_vec: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; 
CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0] -; CHECK-AVX2-NEXT: # xmm1 = mem[0,0] -; CHECK-AVX2-NEXT: vdivpd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_vec: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtuqq2pd %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0] -; CHECK-FMA-NEXT: # xmm1 = mem[0,0] -; CHECK-FMA-NEXT: vdivpd %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_vec: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408] +; CHECK-AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> %mul = fdiv <2 x double> , %conv @@ -810,79 +604,19 @@ define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,1] -; CHECK-SSE-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE-NEXT: psllq %xmm1, %xmm2 -; CHECK-SSE-NEXT: psllq %xmm0, %xmm3 -; CHECK-SSE-NEXT: movq %xmm3, %rax -; CHECK-SSE-NEXT: testq %rax, %rax -; CHECK-SSE-NEXT: js .LBB15_1 -; CHECK-SSE-NEXT: # %bb.2: -; CHECK-SSE-NEXT: xorps %xmm1, %xmm1 -; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 -; CHECK-SSE-NEXT: jmp .LBB15_3 -; CHECK-SSE-NEXT: .LBB15_1: -; CHECK-SSE-NEXT: movq %rax, %rcx -; CHECK-SSE-NEXT: shrq %rcx -; CHECK-SSE-NEXT: andl $1, %eax -; CHECK-SSE-NEXT: orq %rcx, %rax -; CHECK-SSE-NEXT: xorps %xmm1, %xmm1 -; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 
-; CHECK-SSE-NEXT: addss %xmm1, %xmm1 -; CHECK-SSE-NEXT: .LBB15_3: -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-SSE-NEXT: movq %xmm0, %rax -; CHECK-SSE-NEXT: testq %rax, %rax -; CHECK-SSE-NEXT: js .LBB15_4 -; CHECK-SSE-NEXT: # %bb.5: -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0 -; CHECK-SSE-NEXT: jmp .LBB15_6 -; CHECK-SSE-NEXT: .LBB15_4: -; CHECK-SSE-NEXT: movq %rax, %rcx -; CHECK-SSE-NEXT: shrq %rcx -; CHECK-SSE-NEXT: andl $1, %eax -; CHECK-SSE-NEXT: orq %rcx, %rax -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0 -; CHECK-SSE-NEXT: addss %xmm0, %xmm0 -; CHECK-SSE-NEXT: .LBB15_6: -; CHECK-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = <1.0E+0,1.0E+0,u,u> -; CHECK-SSE-NEXT: divps %xmm1, %xmm0 +; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; CHECK-SSE-NEXT: pslld $23, %xmm1 +; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm0 = <1065353216,1065353216,u,u> +; CHECK-SSE-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsrlq $1, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 -; CHECK-AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpextrq $1, %xmm1, %rax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vmovq %xmm1, %rax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero -; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; CHECK-AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vbroadcastss 
{{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; CHECK-AVX2-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtuqq2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; CHECK-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-AVX-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] +; CHECK-AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x float> %mul = fdiv <2 x float> , %conv @@ -1083,58 +817,19 @@ define half @fdiv_pow_shl_cnt_in_bounds(i16 %cnt) { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_in_bounds: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: pushq %rax -; CHECK-SSE-NEXT: .cfi_def_cfa_offset 16 -; CHECK-SSE-NEXT: movl %edi, %ecx -; CHECK-SSE-NEXT: movl $1, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-SSE-NEXT: shll %cl, %eax -; CHECK-SSE-NEXT: movzwl %ax, %eax -; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: callq __extendhfsf2@PLT -; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: divss %xmm0, %xmm1 -; CHECK-SSE-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: popq %rax -; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8 +; CHECK-SSE-NEXT: shll $10, %edi +; CHECK-SSE-NEXT: movl $28672, %eax # imm = 0x7000 +; CHECK-SSE-NEXT: subl %edi, %eax +; CHECK-SSE-NEXT: pinsrw $0, %eax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_in_bounds: -; CHECK-AVX2: # 
%bb.0: -; CHECK-AVX2-NEXT: pushq %rax -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16 -; CHECK-AVX2-NEXT: movl %edi, %ecx -; CHECK-AVX2-NEXT: movl $1, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: movzwl %ax, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: popq %rax -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX2-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_in_bounds: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: movzwl %ax, %eax -; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovd %xmm0, %eax -; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_in_bounds: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: shll $10, %edi +; CHECK-AVX-NEXT: movl $28672, %eax # imm = 0x7000 +; CHECK-AVX-NEXT: subl %edi, %eax +; CHECK-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half %mul = fdiv half 0xH7000, %conv @@ -1144,58 +839,19 @@ define half @fdiv_pow_shl_cnt_in_bounds2(i16 %cnt) { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_in_bounds2: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: pushq %rax -; CHECK-SSE-NEXT: .cfi_def_cfa_offset 16 -; CHECK-SSE-NEXT: movl %edi, %ecx -; CHECK-SSE-NEXT: movl $1, %eax -; 
CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-SSE-NEXT: shll %cl, %eax -; CHECK-SSE-NEXT: movzwl %ax, %eax -; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: callq __extendhfsf2@PLT -; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: divss %xmm0, %xmm1 -; CHECK-SSE-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: popq %rax -; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8 +; CHECK-SSE-NEXT: shll $10, %edi +; CHECK-SSE-NEXT: movl $18432, %eax # imm = 0x4800 +; CHECK-SSE-NEXT: subl %edi, %eax +; CHECK-SSE-NEXT: pinsrw $0, %eax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_in_bounds2: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: pushq %rax -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16 -; CHECK-AVX2-NEXT: movl %edi, %ecx -; CHECK-AVX2-NEXT: movl $1, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: movzwl %ax, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: popq %rax -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX2-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_in_bounds2: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: movzwl %ax, %eax -; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; 
CHECK-FMA-NEXT: vmovd %xmm0, %eax -; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_in_bounds2: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: shll $10, %edi +; CHECK-AVX-NEXT: movl $18432, %eax # imm = 0x4800 +; CHECK-AVX-NEXT: subl %edi, %eax +; CHECK-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half %mul = fdiv half 0xH4800, %conv @@ -1263,5 +919,4 @@ ret half %mul } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK-AVX: {{.*}} ; CHECK-AVX512F: {{.*}}