diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -810,6 +810,24 @@ return !XC; } + // Return true if it's desirable to perform the following transform: + // (fmul C, (uitofp Pow2)) + // -> (bitcast_to_FP (add (bitcast_to_INT C), Log2(Pow2) << mantissa)) + // (fdiv C, (uitofp Pow2)) + // -> (bitcast_to_FP (sub (bitcast_to_INT C), Log2(Pow2) << mantissa)) + // + // This is only queried after we have verified the transform produces a + // bitwise-equal result. + // + // SDNode *N : The FDiv/FMul node we want to transform. + // SDValue FPConst: The Float constant operand in `N`. + // SDValue IntPow2: The Integer power of 2 operand in `N`. + virtual bool optimizeFMulOrFDivAsShiftAddBitcast(SDNode *N, SDValue FPConst, + SDValue IntPow2) const { + // By default, only transform fdiv, which is often very expensive. + return N->getOpcode() == ISD::FDIV; + } + /// These two forms are equivalent: /// sub %y, (xor %x, -1) /// add (add %x, 1), %y diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -611,6 +611,7 @@ SDValue CombineExtLoad(SDNode *N); SDValue CombineZExtLogicopShiftLoad(SDNode *N); SDValue combineRepeatedFPDivisors(SDNode *N); + SDValue combineFMulOrFDivWithIntPow2(SDNode *N); SDValue mergeInsertEltWithShuffle(SDNode *N, unsigned InsIndex); SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex); SDValue combineInsertEltToLoad(SDNode *N, unsigned InsIndex); @@ -620,7 +621,10 @@ SDValue BuildUDIV(SDNode *N); SDValue BuildSREMPow2(SDNode *N); SDValue buildOptimizedSREM(SDValue N0, SDValue N1, SDNode *N); - SDValue BuildLogBase2(SDValue V, const SDLoc &DL); + SDValue BuildLogBase2(SDValue V, const SDLoc &DL, + bool KnownNeverZero = false, + bool InexpensiveOnly = 
false, + std::optional OutVT = std::nullopt); SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags); SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags); SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags); @@ -4389,12 +4393,12 @@ // fold (mul x, (1 << c)) -> x << c if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && - DAG.isKnownToBeAPowerOfTwo(N1) && (!VT.isVector() || Level <= AfterLegalizeVectorOps)) { - SDValue LogBase2 = BuildLogBase2(N1, DL); - EVT ShiftVT = getShiftAmountTy(N0.getValueType()); - SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT); - return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc); + if (SDValue LogBase2 = BuildLogBase2(N1, DL)) { + EVT ShiftVT = getShiftAmountTy(N0.getValueType()); + SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT); + return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc); + } } // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c @@ -4916,31 +4920,31 @@ EVT VT = N->getValueType(0); // fold (udiv x, (1 << c)) -> x >>u c - if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && - DAG.isKnownToBeAPowerOfTwo(N1)) { - SDValue LogBase2 = BuildLogBase2(N1, DL); - AddToWorklist(LogBase2.getNode()); + if (isConstantOrConstantVector(N1, /*NoOpaques*/ true)) { + if (SDValue LogBase2 = BuildLogBase2(N1, DL)) { + AddToWorklist(LogBase2.getNode()); - EVT ShiftVT = getShiftAmountTy(N0.getValueType()); - SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT); - AddToWorklist(Trunc.getNode()); - return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc); + EVT ShiftVT = getShiftAmountTy(N0.getValueType()); + SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT); + AddToWorklist(Trunc.getNode()); + return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc); + } } // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 if (N1.getOpcode() == ISD::SHL) { SDValue N10 = N1.getOperand(0); - if (isConstantOrConstantVector(N10, /*NoOpaques*/ true) && - DAG.isKnownToBeAPowerOfTwo(N10)) { - SDValue LogBase2 = 
BuildLogBase2(N10, DL); - AddToWorklist(LogBase2.getNode()); - - EVT ADDVT = N1.getOperand(1).getValueType(); - SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT); - AddToWorklist(Trunc.getNode()); - SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), Trunc); - AddToWorklist(Add.getNode()); - return DAG.getNode(ISD::SRL, DL, VT, N0, Add); + if (isConstantOrConstantVector(N10, /*NoOpaques*/ true)) { + if (SDValue LogBase2 = BuildLogBase2(N10, DL)) { + AddToWorklist(LogBase2.getNode()); + + EVT ADDVT = N1.getOperand(1).getValueType(); + SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT); + AddToWorklist(Trunc.getNode()); + SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), Trunc); + AddToWorklist(Add.getNode()); + return DAG.getNode(ISD::SRL, DL, VT, N0, Add); + } } } @@ -5158,14 +5162,15 @@ // fold (mulhu x, (1 << c)) -> x >> (bitwidth - c) if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && - DAG.isKnownToBeAPowerOfTwo(N1) && hasOperation(ISD::SRL, VT)) { - unsigned NumEltBits = VT.getScalarSizeInBits(); - SDValue LogBase2 = BuildLogBase2(N1, DL); - SDValue SRLAmt = DAG.getNode( - ISD::SUB, DL, VT, DAG.getConstant(NumEltBits, DL, VT), LogBase2); - EVT ShiftVT = getShiftAmountTy(N0.getValueType()); - SDValue Trunc = DAG.getZExtOrTrunc(SRLAmt, DL, ShiftVT); - return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc); + hasOperation(ISD::SRL, VT)) { + if (SDValue LogBase2 = BuildLogBase2(N1, DL)) { + unsigned NumEltBits = VT.getScalarSizeInBits(); + SDValue SRLAmt = DAG.getNode( + ISD::SUB, DL, VT, DAG.getConstant(NumEltBits, DL, VT), LogBase2); + EVT ShiftVT = getShiftAmountTy(N0.getValueType()); + SDValue Trunc = DAG.getZExtOrTrunc(SRLAmt, DL, ShiftVT); + return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc); + } } // If the type twice as wide is legal, transform the mulhu to a wider multiply @@ -16328,6 +16333,105 @@ return SDValue(); } +// Transform IEEE Floats: +// (fmul C, (uitofp Pow2)) +// -> (bitcast_to_FP (add (bitcast_to_INT C), 
Log2(Pow2) << mantissa)) +// (fdiv C, (uitofp Pow2)) +// -> (bitcast_to_FP (sub (bitcast_to_INT C), Log2(Pow2) << mantissa)) +// +// The rationale is that fmul/fdiv by a power of 2 just changes the exponent, so +// there is no need for more than an add/sub. +// +// This is valid under the following circumstances: +// 1) We are dealing with IEEE floats +// 2) C is normal +// 3) The fmul/fdiv add/sub will not go outside of min/max exponent bounds. +// TODO: Much of this could also be used for generating `ldexp` on targets that +// prefer it. +SDValue DAGCombiner::combineFMulOrFDivWithIntPow2(SDNode *N) { + EVT VT = N->getValueType(0); + SDValue ConstOp, Pow2Op; + + int Mantissa = -1; + auto GetConstAndPow2Ops = [&](unsigned ConstOpIdx) { + if (ConstOpIdx == 1 && N->getOpcode() == ISD::FDIV) + return false; + + ConstOp = peekThroughBitcasts(N->getOperand(ConstOpIdx)); + Pow2Op = N->getOperand(1 - ConstOpIdx); + if (Pow2Op.getOpcode() != ISD::UINT_TO_FP && + (Pow2Op.getOpcode() != ISD::SINT_TO_FP || + !DAG.computeKnownBits(Pow2Op).isNonNegative())) + return false; + + Pow2Op = Pow2Op.getOperand(0); + + // TODO(1): We may be able to include undefs. + // TODO(2): We could also handle non-splat vector types. + ConstantFPSDNode *CFP = + isConstOrConstSplatFP(ConstOp, /*AllowUndefs*/ false); + if (CFP == nullptr) + return false; + const APFloat &APF = CFP->getValueAPF(); + + // Make sure we have a normal IEEE constant. + if (!APF.isNormal() || !APF.isIEEE()) + return false; + + // `Log2(Pow2Op) < Pow2Op.getScalarSizeInBits()`. + // TODO: We could use knownbits to make this bound more precise. + int MaxExpChange = Pow2Op.getValueType().getScalarSizeInBits(); + + // Make sure the float's exponent is within the bounds where this transform + // produces a bitwise-equal value. + int CurExp = ilogb(APF); + // FMul by pow2 will only increase exponent. + int MinExp = N->getOpcode() == ISD::FMUL ? CurExp : (CurExp - MaxExpChange); + // FDiv by pow2 will only decrease exponent. 
+ int MaxExp = N->getOpcode() == ISD::FDIV ? CurExp : (CurExp + MaxExpChange); + if (MinExp <= APFloat::semanticsMinExponent(APF.getSemantics()) || + MaxExp >= APFloat::semanticsMaxExponent(APF.getSemantics())) + return false; + + // Finally make sure we actually know the mantissa for the float type. + Mantissa = APFloat::semanticsPrecision(APF.getSemantics()) - 1; + return Mantissa > 0; + }; + + if (!GetConstAndPow2Ops(0) && !GetConstAndPow2Ops(1)) + return SDValue(); + + if (!TLI.optimizeFMulOrFDivAsShiftAddBitcast(N, ConstOp, Pow2Op)) + return SDValue(); + + // Get log2 after all other checks have taken place. This is because + // BuildLogBase2 may create a new node. + SDLoc DL(N); + // Get Log2 type with same bitwidth as the float type (VT). + EVT NewIntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits()); + if (VT.isVector()) + NewIntVT = EVT::getVectorVT(*DAG.getContext(), NewIntVT, + VT.getVectorNumElements()); + + SDValue Log2 = BuildLogBase2(Pow2Op, DL, DAG.isKnownNeverZero(Pow2Op), + /*InexpensiveOnly*/ true, NewIntVT); + if (!Log2) + return SDValue(); + + // Perform actual transform. + SDValue MantissaShiftCnt = + DAG.getConstant(Mantissa, DL, getShiftAmountTy(NewIntVT)); + // TODO: Sometimes Log2 is of form `(X + C)`. `(X + C) << C1` should fold to + // `(X << C1) + (C << C1)`, but that isn't always the case because of the + // cast. We could implement that fold here by handling the casts. + SDValue Shift = DAG.getNode(ISD::SHL, DL, NewIntVT, Log2, MantissaShiftCnt); + SDValue ResAsInt = + DAG.getNode(N->getOpcode() == ISD::FMUL ? ISD::ADD : ISD::SUB, DL, + NewIntVT, DAG.getBitcast(NewIntVT, ConstOp), Shift); + SDValue ResAsFP = DAG.getBitcast(VT, ResAsInt); + return ResAsFP; +} + SDValue DAGCombiner::visitFMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -16468,6 +16572,11 @@ return Fused; } + // Don't do `combineFMulOrFDivWithIntPow2` until after FMUL -> FMA has been + // able to run. 
+ if (SDValue R = combineFMulOrFDivWithIntPow2(N)) + return R; + return SDValue(); } @@ -16819,6 +16928,9 @@ return DAG.getNode(ISD::FDIV, SDLoc(N), VT, NegN0, NegN1); } + if (SDValue R = combineFMulOrFDivWithIntPow2(N)) + return R; + return SDValue(); } @@ -21861,7 +21973,7 @@ if (DAG.isKnownNeverZero(Index)) return DAG.getUNDEF(ScalarVT); - // Check if the result type doesn't match the inserted element type. + // Check if the result type doesn't match the inserted element type. // The inserted element and extracted element may have mismatched bitwidth. // As a result, EXTRACT_VECTOR_ELT may extend or truncate the extracted vector. SDValue InOp = VecOp.getOperand(0); @@ -27142,10 +27254,129 @@ return SDValue(); } +// This is basically just a port of takeLog2 from InstCombineMulDivRem.cpp +// +// Returns the node that represents `Log2(Op)`. This may create a new node. If +// we are unable to compute `Log2(Op)` it returns `SDValue()`. +// +// All nodes will be created at `DL` and the output will be of type `VT`. +// +// This will only return `Log2(Op)` if we can prove `Op` is non-zero. Set +// `AssumeNonZero` if this function should simply assume `Op` is non-zero +// rather than requiring a proof. +static SDValue takeInexpensiveLog2(SelectionDAG &DAG, const SDLoc &DL, EVT VT, + SDValue Op, unsigned Depth, + bool AssumeNonZero) { + assert(VT.isInteger() && "Only integer types are supported!"); + + auto PeekThroughCastsAndTrunc = [](SDValue V) { + while (true) { + switch (V.getOpcode()) { + case ISD::TRUNCATE: + case ISD::ZERO_EXTEND: + V = V.getOperand(0); + break; + default: + return V; + } + } + }; + + if (VT.isScalableVector()) + return SDValue(); + + Op = PeekThroughCastsAndTrunc(Op); + + // Helper for determining whether a value is a power-2 constant scalar or a + // vector of such elements. 
+ SmallVector Pow2Constants; + auto IsPowerOfTwo = [&Pow2Constants](ConstantSDNode *C) { + if (C->isZero() || C->isOpaque()) + return false; + // TODO: We may also be able to support negative powers of 2 here. + if (C->getAPIntValue().isPowerOf2()) { + Pow2Constants.emplace_back(C->getAPIntValue()); + return true; + } + return false; + }; + + if (ISD::matchUnaryPredicate(Op, IsPowerOfTwo)) { + if (!VT.isVector()) + return DAG.getConstant(Pow2Constants.back().logBase2(), DL, VT); + // We need to create a build vector + SmallVector Log2Ops; + for (const APInt &Pow2 : Pow2Constants) + Log2Ops.emplace_back( + DAG.getConstant(Pow2.logBase2(), DL, VT.getScalarType())); + return DAG.getBuildVector(VT, DL, Log2Ops); + } + + if (Depth >= DAG.MaxRecursionDepth) + return SDValue(); + + auto CastToVT = [&](EVT NewVT, SDValue ToCast) { + EVT CurVT = ToCast.getValueType(); + ToCast = PeekThroughCastsAndTrunc(ToCast); + if (NewVT == CurVT) + return ToCast; + + if (NewVT.getSizeInBits() == CurVT.getSizeInBits()) + return DAG.getBitcast(NewVT, ToCast); + + return DAG.getZExtOrTrunc(ToCast, DL, NewVT); + }; + + // log2(X << Y) -> log2(X) + Y + if (Op.getOpcode() == ISD::SHL) { + // 1 << Y and X nuw/nsw << Y are all non-zero. + if (AssumeNonZero || Op->getFlags().hasNoUnsignedWrap() || + Op->getFlags().hasNoSignedWrap() || isOneConstant(Op.getOperand(0))) + if (SDValue LogX = takeInexpensiveLog2(DAG, DL, VT, Op.getOperand(0), + Depth + 1, AssumeNonZero)) + return DAG.getNode(ISD::ADD, DL, VT, LogX, + CastToVT(VT, Op.getOperand(1))); + } + + // c ? X : Y -> c ? 
Log2(X) : Log2(Y) + if ((Op.getOpcode() == ISD::SELECT || Op.getOpcode() == ISD::VSELECT) && + Op.hasOneUse()) { + if (SDValue LogX = takeInexpensiveLog2(DAG, DL, VT, Op.getOperand(1), + Depth + 1, AssumeNonZero)) + if (SDValue LogY = takeInexpensiveLog2(DAG, DL, VT, Op.getOperand(2), + Depth + 1, AssumeNonZero)) + return DAG.getSelect(DL, VT, Op.getOperand(0), LogX, LogY); + } + + // log2(umin(X, Y)) -> umin(log2(X), log2(Y)) + // log2(umax(X, Y)) -> umax(log2(X), log2(Y)) + if ((Op.getOpcode() == ISD::UMIN || Op.getOpcode() == ISD::UMAX) && + Op.hasOneUse()) { + // Pass AssumeNonZero as false here. Otherwise we can hit a case where + // log2(umax(X, Y)) != umax(log2(X), log2(Y)) (because of overflow). + if (SDValue LogX = + takeInexpensiveLog2(DAG, DL, VT, Op.getOperand(0), Depth + 1, + /*AssumeNonZero*/ false)) + if (SDValue LogY = + takeInexpensiveLog2(DAG, DL, VT, Op.getOperand(1), Depth + 1, + /*AssumeNonZero*/ false)) + return DAG.getNode(Op.getOpcode(), DL, VT, LogX, LogY); + } + + return SDValue(); +} + /// Determines the LogBase2 value for a non-null input value using the /// transform: LogBase2(V) = (EltBits - 1) - ctlz(V). -SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) { - EVT VT = V.getValueType(); +SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL, + bool KnownNonZero, bool InexpensiveOnly, + std::optional OutVT) { + EVT VT = OutVT ? 
*OutVT : V.getValueType(); + SDValue InexpensiveLogBase2 = + takeInexpensiveLog2(DAG, DL, VT, V, /*Depth*/ 0, KnownNonZero); + if (InexpensiveLogBase2 || InexpensiveOnly || !DAG.isKnownToBeAPowerOfTwo(V)) + return InexpensiveLogBase2; + SDValue Ctlz = DAG.getNode(ISD::CTLZ, DL, VT, V); SDValue Base = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT); SDValue LogBase2 = DAG.getNode(ISD::SUB, DL, VT, Base, Ctlz); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1808,6 +1808,9 @@ const SDLoc &dl, SelectionDAG &DAG, SDValue &X86CC) const; + bool optimizeFMulOrFDivAsShiftAddBitcast(SDNode *N, SDValue FPConst, + SDValue IntPow2) const override; + /// Check if replacement of SQRT with RSQRT should be disabled. bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -22445,6 +22445,24 @@ return !VT.isVector() || Cond != ISD::CondCode::SETEQ; } +bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast( + SDNode *N, SDValue, SDValue IntPow2) const { + if (N->getOpcode() == ISD::FDIV) + return true; + + EVT FPVT = N->getValueType(0); + EVT IntVT = IntPow2.getValueType(); + + // This indicates a non-free bitcast. + // TODO: This is probably overly conservative as we will need to scale the + // integer vector anyways for the int->fp cast. + if (FPVT.isVector() && + FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits()) + return false; + + return true; +} + /// Check if replacement of SQRT with RSQRT should be disabled. 
bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll --- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll @@ -102,197 +102,42 @@ ; VI-LABEL: fdiv_pow2_4xfloat: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1 -; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 -; VI-NEXT: s_mov_b32 s10, 0x41100000 -; VI-NEXT: v_lshlrev_b32_e64 v1, v1, 1 -; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; VI-NEXT: v_div_scale_f32 v4, s[4:5], v0, v0, s10 -; VI-NEXT: v_div_scale_f32 v6, vcc, s10, v0, s10 -; VI-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, s10 -; VI-NEXT: v_lshlrev_b32_e64 v2, v2, 1 -; VI-NEXT: v_cvt_f32_u32_e32 v2, v2 -; VI-NEXT: v_div_scale_f32 v7, s[4:5], s10, v1, s10 -; VI-NEXT: v_lshlrev_b32_e64 v3, v3, 1 -; VI-NEXT: v_div_scale_f32 v9, s[6:7], v2, v2, s10 -; VI-NEXT: v_cvt_f32_u32_e32 v3, v3 -; VI-NEXT: v_rcp_f32_e32 v8, v4 -; VI-NEXT: v_rcp_f32_e32 v10, v5 -; VI-NEXT: v_fma_f32 v11, -v4, v8, 1.0 -; VI-NEXT: v_fma_f32 v8, v11, v8, v8 -; VI-NEXT: v_mul_f32_e32 v11, v6, v8 -; VI-NEXT: v_fma_f32 v12, -v4, v11, v6 -; VI-NEXT: v_fma_f32 v11, v12, v8, v11 -; VI-NEXT: v_fma_f32 v4, -v4, v11, v6 -; VI-NEXT: v_div_scale_f32 v6, s[6:7], s10, v2, s10 -; VI-NEXT: v_div_fmas_f32 v4, v4, v8, v11 -; VI-NEXT: v_div_scale_f32 v11, s[8:9], v3, v3, s10 -; VI-NEXT: v_fma_f32 v8, -v5, v10, 1.0 -; VI-NEXT: v_fma_f32 v8, v8, v10, v10 -; VI-NEXT: v_mul_f32_e32 v10, v7, v8 -; VI-NEXT: v_fma_f32 v12, -v5, v10, v7 -; VI-NEXT: v_fma_f32 v10, v12, v8, v10 -; VI-NEXT: v_div_scale_f32 v12, s[8:9], s10, v3, s10 -; VI-NEXT: v_rcp_f32_e32 v13, v9 -; VI-NEXT: v_fma_f32 v5, -v5, v10, v7 -; VI-NEXT: s_mov_b64 vcc, s[4:5] -; VI-NEXT: v_div_fmas_f32 v5, v5, v8, v10 -; VI-NEXT: v_fma_f32 v7, -v9, v13, 1.0 -; VI-NEXT: 
v_fma_f32 v7, v7, v13, v13 -; VI-NEXT: v_mul_f32_e32 v8, v6, v7 -; VI-NEXT: v_fma_f32 v10, -v9, v8, v6 -; VI-NEXT: v_fma_f32 v8, v10, v7, v8 -; VI-NEXT: v_rcp_f32_e32 v10, v11 -; VI-NEXT: v_fma_f32 v6, -v9, v8, v6 -; VI-NEXT: s_mov_b64 vcc, s[6:7] -; VI-NEXT: v_div_fmas_f32 v6, v6, v7, v8 -; VI-NEXT: v_fma_f32 v7, -v11, v10, 1.0 -; VI-NEXT: v_fma_f32 v7, v7, v10, v10 -; VI-NEXT: v_mul_f32_e32 v8, v12, v7 -; VI-NEXT: v_fma_f32 v9, -v11, v8, v12 -; VI-NEXT: v_fma_f32 v8, v9, v7, v8 -; VI-NEXT: v_fma_f32 v9, -v11, v8, v12 -; VI-NEXT: s_mov_b64 vcc, s[8:9] -; VI-NEXT: v_div_fmas_f32 v7, v9, v7, v8 -; VI-NEXT: v_div_fixup_f32 v0, v4, v0, s10 -; VI-NEXT: v_div_fixup_f32 v1, v5, v1, s10 -; VI-NEXT: v_div_fixup_f32 v2, v6, v2, s10 -; VI-NEXT: v_div_fixup_f32 v3, v7, v3, s10 +; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 23, v1 +; VI-NEXT: v_lshlrev_b32_e32 v2, 23, v2 +; VI-NEXT: v_lshlrev_b32_e32 v3, 23, v3 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x41100000, v0 +; VI-NEXT: v_sub_u32_e32 v1, vcc, 0x41100000, v1 +; VI-NEXT: v_sub_u32_e32 v2, vcc, 0x41100000, v2 +; VI-NEXT: v_sub_u32_e32 v3, vcc, 0x41100000, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow2_4xfloat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, 1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, 1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, 1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX10-NEXT: v_div_scale_f32 v4, s4, v0, v0, 0x41100000 -; GFX10-NEXT: v_div_scale_f32 v5, s4, v1, v1, 0x41100000 -; GFX10-NEXT: v_div_scale_f32 v6, s4, v2, v2, 0x41100000 -; GFX10-NEXT: v_rcp_f32_e32 v8, v4 -; GFX10-NEXT: v_div_scale_f32 v7, s4, v3, v3, 0x41100000 -; GFX10-NEXT: v_rcp_f32_e32 v9, v5 -; GFX10-NEXT: v_rcp_f32_e32 v10, v6 -; GFX10-NEXT: v_div_scale_f32 v12, vcc_lo, 
0x41100000, v0, 0x41100000 -; GFX10-NEXT: v_rcp_f32_e32 v11, v7 -; GFX10-NEXT: v_div_scale_f32 v16, s4, 0x41100000, v1, 0x41100000 -; GFX10-NEXT: v_fma_f32 v13, -v4, v8, 1.0 -; GFX10-NEXT: v_fma_f32 v14, -v5, v9, 1.0 -; GFX10-NEXT: v_fma_f32 v15, -v6, v10, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v8, v13, v8 -; GFX10-NEXT: v_div_scale_f32 v13, s5, 0x41100000, v2, 0x41100000 -; GFX10-NEXT: v_fma_f32 v17, -v7, v11, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v9, v14, v9 -; GFX10-NEXT: v_fmac_f32_e32 v10, v15, v10 -; GFX10-NEXT: v_mul_f32_e32 v15, v12, v8 -; GFX10-NEXT: v_div_scale_f32 v14, s6, 0x41100000, v3, 0x41100000 -; GFX10-NEXT: v_fmac_f32_e32 v11, v17, v11 -; GFX10-NEXT: v_mul_f32_e32 v17, v16, v9 -; GFX10-NEXT: v_mul_f32_e32 v18, v13, v10 -; GFX10-NEXT: v_fma_f32 v20, -v4, v15, v12 -; GFX10-NEXT: v_mul_f32_e32 v19, v14, v11 -; GFX10-NEXT: v_fma_f32 v21, -v5, v17, v16 -; GFX10-NEXT: v_fma_f32 v22, -v6, v18, v13 -; GFX10-NEXT: v_fmac_f32_e32 v15, v20, v8 -; GFX10-NEXT: v_fma_f32 v23, -v7, v19, v14 -; GFX10-NEXT: v_fmac_f32_e32 v17, v21, v9 -; GFX10-NEXT: v_fmac_f32_e32 v18, v22, v10 -; GFX10-NEXT: v_fma_f32 v4, -v4, v15, v12 -; GFX10-NEXT: v_fmac_f32_e32 v19, v23, v11 -; GFX10-NEXT: v_fma_f32 v5, -v5, v17, v16 -; GFX10-NEXT: v_fma_f32 v6, -v6, v18, v13 -; GFX10-NEXT: v_div_fmas_f32 v4, v4, v8, v15 -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_fma_f32 v7, -v7, v19, v14 -; GFX10-NEXT: v_div_fmas_f32 v5, v5, v9, v17 -; GFX10-NEXT: s_mov_b32 vcc_lo, s5 -; GFX10-NEXT: v_div_fixup_f32 v0, v4, v0, 0x41100000 -; GFX10-NEXT: v_div_fmas_f32 v6, v6, v10, v18 -; GFX10-NEXT: s_mov_b32 vcc_lo, s6 -; GFX10-NEXT: v_div_fixup_f32 v1, v5, v1, 0x41100000 -; GFX10-NEXT: v_div_fmas_f32 v7, v7, v11, v19 -; GFX10-NEXT: v_div_fixup_f32 v2, v6, v2, 0x41100000 -; GFX10-NEXT: v_div_fixup_f32 v3, v7, v3, 0x41100000 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 23, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 23, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 23, v3 +; 
GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x41100000, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x41100000, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x41100000, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 0x41100000, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow2_4xfloat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v2, 1 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v3, 1 -; GFX11-NEXT: v_lshlrev_b32_e64 v1, v1, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 23, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 23, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 23, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x41100000, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x41100000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_div_scale_f32 v4, null, v0, v0, 0x41100000 -; GFX11-NEXT: v_div_scale_f32 v6, null, v2, v2, 0x41100000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_div_scale_f32 v5, null, v1, v1, 0x41100000 -; GFX11-NEXT: v_div_scale_f32 v7, null, v3, v3, 0x41100000 -; GFX11-NEXT: v_rcp_f32_e32 v8, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_rcp_f32_e32 v10, v6 -; GFX11-NEXT: v_rcp_f32_e32 v9, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) -; GFX11-NEXT: v_rcp_f32_e32 v11, v7 -; GFX11-NEXT: v_div_scale_f32 v12, vcc_lo, 0x41100000, v0, 0x41100000 -; GFX11-NEXT: v_div_scale_f32 v16, s0, 0x41100000, v1, 0x41100000 
-; GFX11-NEXT: v_fma_f32 v13, -v4, v8, 1.0 -; GFX11-NEXT: v_fma_f32 v15, -v6, v10, 1.0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v14, -v5, v9, 1.0 -; GFX11-NEXT: v_fma_f32 v17, -v7, v11, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v8, v13, v8 -; GFX11-NEXT: v_fmac_f32_e32 v10, v15, v10 -; GFX11-NEXT: v_div_scale_f32 v13, s1, 0x41100000, v2, 0x41100000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fmac_f32_e32 v11, v17, v11 -; GFX11-NEXT: v_fmac_f32_e32 v9, v14, v9 -; GFX11-NEXT: v_dual_mul_f32 v15, v12, v8 :: v_dual_mul_f32 v18, v13, v10 -; GFX11-NEXT: v_div_scale_f32 v14, s2, 0x41100000, v3, 0x41100000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f32 v20, -v4, v15, v12 -; GFX11-NEXT: v_fma_f32 v22, -v6, v18, v13 -; GFX11-NEXT: v_mul_f32_e32 v17, v16, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mul_f32_e32 v19, v14, v11 -; GFX11-NEXT: v_dual_fmac_f32 v15, v20, v8 :: v_dual_fmac_f32 v18, v22, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f32 v21, -v5, v17, v16 -; GFX11-NEXT: v_fma_f32 v23, -v7, v19, v14 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_fma_f32 v4, -v4, v15, v12 -; GFX11-NEXT: v_fma_f32 v6, -v6, v18, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_fmac_f32_e32 v17, v21, v9 -; GFX11-NEXT: v_fmac_f32_e32 v19, v23, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_div_fmas_f32 v4, v4, v8, v15 -; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_fma_f32 v5, -v5, v17, v16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f32 v7, -v7, v19, v14 -; GFX11-NEXT: v_div_fixup_f32 v0, 
v4, v0, 0x41100000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fmas_f32 v5, v5, v9, v17 -; GFX11-NEXT: s_mov_b32 vcc_lo, s1 -; GFX11-NEXT: v_div_fmas_f32 v6, v6, v10, v18 -; GFX11-NEXT: s_mov_b32 vcc_lo, s2 -; GFX11-NEXT: v_div_fixup_f32 v1, v5, v1, 0x41100000 -; GFX11-NEXT: v_div_fmas_f32 v7, v7, v11, v19 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fixup_f32 v2, v6, v2, 0x41100000 -; GFX11-NEXT: v_div_fixup_f32 v3, v7, v3, 0x41100000 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x41100000, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 0x41100000, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %p2 = shl <4 x i32> , %i %p2_f = uitofp <4 x i32> %p2 to <4 x float> @@ -469,187 +314,56 @@ ; VI-LABEL: fdiv_pow2_8xhalf: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e64 v4, v3, 1 -; VI-NEXT: v_mov_b32_e32 v5, 1 -; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v8, v0, 1 -; VI-NEXT: v_cvt_f16_u16_e32 v4, v4 -; VI-NEXT: v_lshlrev_b16_e64 v6, v2, 1 -; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v7, v1, 1 -; VI-NEXT: v_lshlrev_b16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cvt_f16_u16_e32 v5, v8 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; VI-NEXT: v_cvt_f16_u16_e32 v3, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v9, v3 -; VI-NEXT: v_cvt_f16_u16_e32 v6, v6 -; VI-NEXT: v_rcp_f32_e32 v8, v8 -; VI-NEXT: v_cvt_f32_f16_e32 v10, v6 -; VI-NEXT: v_rcp_f32_e32 v9, v9 -; VI-NEXT: v_cvt_f16_u16_e32 v2, v2 -; VI-NEXT: v_mul_f32_e32 v8, 0x46000000, v8 -; VI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; VI-NEXT: v_mul_f32_e32 v9, 
0x46000000, v9 -; VI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; VI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; VI-NEXT: v_rcp_f32_e32 v10, v10 -; VI-NEXT: s_movk_i32 s4, 0x7000 -; VI-NEXT: v_cvt_f16_u16_e32 v7, v7 -; VI-NEXT: v_div_fixup_f16 v4, v8, v4, s4 -; VI-NEXT: v_rcp_f32_e32 v8, v11 -; VI-NEXT: v_div_fixup_f16 v3, v9, v3, s4 -; VI-NEXT: v_mul_f32_e32 v9, 0x46000000, v10 -; VI-NEXT: v_cvt_f32_f16_e32 v10, v7 -; VI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; VI-NEXT: v_mul_f32_e32 v8, 0x46000000, v8 -; VI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; VI-NEXT: v_rcp_f32_e32 v10, v10 -; VI-NEXT: v_cvt_f16_u16_e32 v0, v0 -; VI-NEXT: v_cvt_f16_u16_e32 v1, v1 -; VI-NEXT: v_div_fixup_f16 v6, v9, v6, s4 -; VI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; VI-NEXT: v_div_fixup_f16 v2, v8, v2, s4 -; VI-NEXT: v_mul_f32_e32 v8, 0x46000000, v10 -; VI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; VI-NEXT: v_rcp_f32_e32 v9, v9 -; VI-NEXT: v_rcp_f32_e32 v11, v11 -; VI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; VI-NEXT: v_rcp_f32_e32 v10, v10 -; VI-NEXT: v_mul_f32_e32 v9, 0x46000000, v9 -; VI-NEXT: v_mul_f32_e32 v11, 0x46000000, v11 -; VI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; VI-NEXT: v_mul_f32_e32 v10, 0x46000000, v10 -; VI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; VI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; VI-NEXT: v_div_fixup_f16 v1, v9, v1, s4 -; VI-NEXT: v_div_fixup_f16 v7, v8, v7, s4 -; VI-NEXT: v_div_fixup_f16 v0, v11, v0, s4 -; VI-NEXT: v_div_fixup_f16 v5, v10, v5, s4 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_or_b32_e32 v0, v5, v0 -; VI-NEXT: v_or_b32_e32 v1, v7, v1 -; VI-NEXT: v_or_b32_e32 v2, v6, v2 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, 10 +; VI-NEXT: v_lshlrev_b16_sdwa v5, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v6, 0x7000 +; VI-NEXT: v_lshlrev_b16_e32 v3, 10, v3 +; VI-NEXT: v_lshlrev_b16_sdwa 
v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v2, 10, v2 +; VI-NEXT: v_lshlrev_b16_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 10, v1 +; VI-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v0, 10, v0 +; VI-NEXT: v_sub_u16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_sub_u16_sdwa v7, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_sub_u16_sdwa v8, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_sub_u16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_sub_u16_e32 v0, 0x7000, v0 +; VI-NEXT: v_sub_u16_e32 v1, 0x7000, v1 +; VI-NEXT: v_sub_u16_e32 v2, 0x7000, v2 +; VI-NEXT: v_sub_u16_e32 v3, 0x7000, v3 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: v_or_b32_e32 v1, v1, v8 +; VI-NEXT: v_or_b32_e32 v2, v2, v7 +; VI-NEXT: v_or_b32_e32 v3, v3, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow2_8xhalf: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0] -; GFX10-NEXT: s_mov_b32 s4, 0x46000000 -; GFX10-NEXT: v_cvt_f16_u16_e32 v5, v3 -; GFX10-NEXT: v_cvt_f16_u16_e32 v7, v2 -; GFX10-NEXT: v_cvt_f16_u16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f16_u16_e32 v0, v0 -; GFX10-NEXT: v_cvt_f16_u16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v8, v5 -; 
GFX10-NEXT: v_cvt_f16_u16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f16_u16_e32 v1, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v10, v7 -; GFX10-NEXT: v_cvt_f32_f16_e32 v9, v3 -; GFX10-NEXT: v_rcp_f32_e32 v8, v8 -; GFX10-NEXT: v_cvt_f32_f16_e32 v11, v2 -; GFX10-NEXT: v_cvt_f32_f16_e32 v12, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v13, v6 -; GFX10-NEXT: v_rcp_f32_e32 v10, v10 -; GFX10-NEXT: v_cvt_f32_f16_e32 v14, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v15, v4 -; GFX10-NEXT: v_rcp_f32_e32 v9, v9 -; GFX10-NEXT: v_rcp_f32_e32 v11, v11 -; GFX10-NEXT: v_rcp_f32_e32 v12, v12 -; GFX10-NEXT: v_rcp_f32_e32 v14, v14 -; GFX10-NEXT: v_rcp_f32_e32 v15, v15 -; GFX10-NEXT: v_rcp_f32_e32 v13, v13 -; GFX10-NEXT: v_fma_mixlo_f16 v8, v8, s4, 0 -; GFX10-NEXT: v_fma_mixlo_f16 v10, v10, s4, 0 -; GFX10-NEXT: v_div_fixup_f16 v5, v8, v5, 0x7000 -; GFX10-NEXT: v_fma_mixlo_f16 v8, v9, s4, 0 -; GFX10-NEXT: v_div_fixup_f16 v7, v10, v7, 0x7000 -; GFX10-NEXT: v_fma_mixlo_f16 v9, v12, s4, 0 -; GFX10-NEXT: v_fma_mixlo_f16 v10, v14, s4, 0 -; GFX10-NEXT: v_fma_mixlo_f16 v12, v15, s4, 0 -; GFX10-NEXT: v_fma_mixlo_f16 v13, v13, s4, 0 -; GFX10-NEXT: v_fma_mixlo_f16 v11, v11, s4, 0 -; GFX10-NEXT: v_div_fixup_f16 v1, v9, v1, 0x7000 -; GFX10-NEXT: v_div_fixup_f16 v0, v10, v0, 0x7000 -; GFX10-NEXT: v_div_fixup_f16 v4, v12, v4, 0x7000 -; GFX10-NEXT: v_div_fixup_f16 v6, v13, v6, 0x7000 -; GFX10-NEXT: v_div_fixup_f16 v2, v11, v2, 0x7000 -; GFX10-NEXT: v_div_fixup_f16 v3, v8, v3, 0x7000 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v4 -; GFX10-NEXT: v_pack_b32_f16 v1, v1, v6 -; GFX10-NEXT: v_pack_b32_f16 v2, v7, v2 -; GFX10-NEXT: v_pack_b32_f16 v3, v5, v3 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, 10, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v1, 10, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v2, 10, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v3, 10, v3 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_sub_i16 v0, 0x7000, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_sub_i16 v1, 0x7000, v1 
op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_sub_i16 v2, 0x7000, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_sub_i16 v3, 0x7000, v3 op_sel_hi:[0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow2_8xhalf: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0] -; GFX11-NEXT: s_mov_b32 s0, 0x46000000 -; GFX11-NEXT: v_cvt_f16_u16_e32 v4, v3 -; GFX11-NEXT: v_cvt_f16_u16_e32 v5, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v5 -; GFX11-NEXT: v_cvt_f16_u16_e32 v3, v3 -; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1 -; GFX11-NEXT: v_cvt_f16_u16_e32 v6, v6 -; GFX11-NEXT: v_cvt_f16_u16_e32 v9, v9 -; GFX11-NEXT: v_cvt_f16_u16_e32 v2, v2 -; GFX11-NEXT: v_rcp_f32_e32 v7, v7 -; GFX11-NEXT: v_rcp_f32_e32 v8, v8 -; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v3 -; GFX11-NEXT: v_cvt_f32_f16_e32 v11, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v12, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v13, v6 -; GFX11-NEXT: v_cvt_f32_f16_e32 v14, v9 -; GFX11-NEXT: v_cvt_f32_f16_e32 v15, v2 -; GFX11-NEXT: v_rcp_f32_e32 v10, v10 -; GFX11-NEXT: v_rcp_f32_e32 v11, v11 -; GFX11-NEXT: v_rcp_f32_e32 v12, v12 -; GFX11-NEXT: v_rcp_f32_e32 v13, v13 -; GFX11-NEXT: v_rcp_f32_e32 v14, v14 -; GFX11-NEXT: v_rcp_f32_e32 v15, v15 -; GFX11-NEXT: v_fma_mixlo_f16 v7, v7, s0, 0 -; GFX11-NEXT: v_fma_mixlo_f16 v8, v8, s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_div_fixup_f16 v4, v7, v4, 0x7000 -; GFX11-NEXT: v_fma_mixlo_f16 v7, v10, s0, 0 -; GFX11-NEXT: v_div_fixup_f16 v5, v8, v5, 0x7000 -; 
GFX11-NEXT: v_fma_mixlo_f16 v8, v11, s0, 0 -; GFX11-NEXT: v_fma_mixlo_f16 v10, v12, s0, 0 -; GFX11-NEXT: v_fma_mixlo_f16 v11, v13, s0, 0 -; GFX11-NEXT: v_fma_mixlo_f16 v12, v14, s0, 0 -; GFX11-NEXT: v_fma_mixlo_f16 v13, v15, s0, 0 -; GFX11-NEXT: v_div_fixup_f16 v1, v8, v1, 0x7000 -; GFX11-NEXT: v_div_fixup_f16 v0, v10, v0, 0x7000 -; GFX11-NEXT: v_div_fixup_f16 v6, v11, v6, 0x7000 -; GFX11-NEXT: v_div_fixup_f16 v8, v12, v9, 0x7000 -; GFX11-NEXT: v_div_fixup_f16 v2, v13, v2, 0x7000 -; GFX11-NEXT: v_div_fixup_f16 v3, v7, v3, 0x7000 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, 10, v0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_lshlrev_b16 v1, 10, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_lshlrev_b16 v2, 10, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_lshlrev_b16 v3, 10, v3 op_sel_hi:[0,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v6 -; GFX11-NEXT: v_pack_b32_f16 v1, v1, v8 +; GFX11-NEXT: v_pk_sub_i16 v0, 0x7000, v0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_sub_i16 v1, 0x7000, v1 op_sel_hi:[0,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_pack_b32_f16 v2, v5, v2 -; GFX11-NEXT: v_pack_b32_f16 v3, v4, v3 +; GFX11-NEXT: v_pk_sub_i16 v2, 0x7000, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_sub_i16 v3, 0x7000, v3 op_sel_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] %p2 = shl <8 x i16> , %i %p2_f = uitofp <8 x i16> %p2 to <8 x half> @@ -2342,129 +2056,37 @@ ; VI-LABEL: fdiv_pow_shl_cnt_vec: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 1 -; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v1 -; VI-NEXT: v_lshlrev_b64 v[1:2], v2, 1 -; VI-NEXT: v_cvt_f64_u32_e32 v[5:6], v2 -; VI-NEXT: v_ldexp_f64 v[2:3], v[3:4], 32 -; VI-NEXT: v_ldexp_f64 v[4:5], v[5:6], 32 -; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], v0 -; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v1 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7] -; VI-NEXT: v_add_f64 v[4:5], v[4:5], v[0:1] -; 
VI-NEXT: v_div_scale_f64 v[0:1], s[4:5], v[2:3], v[2:3], 1.0 -; VI-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], 1.0 -; VI-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[4:5], 1.0 -; VI-NEXT: v_rcp_f64_e32 v[8:9], v[0:1] -; VI-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; VI-NEXT: v_fma_f64 v[12:13], -v[0:1], v[8:9], 1.0 -; VI-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; VI-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[2:3], 1.0 -; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; VI-NEXT: v_fma_f64 v[14:15], -v[0:1], v[8:9], 1.0 -; VI-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 -; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; VI-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] -; VI-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], v[14:15], v[12:13] -; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; VI-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[8:9], v[14:15] -; VI-NEXT: s_mov_b64 vcc, s[4:5] -; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] -; VI-NEXT: v_div_fixup_f64 v[0:1], v[0:1], v[2:3], 1.0 -; VI-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[4:5], 1.0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 20, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x3ff00000 +; VI-NEXT: v_sub_u32_e64 v0, vcc, 0, 0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; VI-NEXT: v_subb_u32_e64 v1, s[4:5], v3, v1, vcc +; VI-NEXT: v_subb_u32_e32 v3, vcc, v3, v2, vcc +; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow_shl_cnt_vec: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 1 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 1 -; GFX10-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 -; GFX10-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 -; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 -; GFX10-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 -; GFX10-NEXT: v_ldexp_f64 v[3:4], 
v[4:5], 32 -; GFX10-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 -; GFX10-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] -; GFX10-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] -; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] -; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 -; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 20, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, 0, 0 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, s4, 0x3ff00000, v1, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt_vec: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 1 -; GFX11-NEXT: 
v_lshlrev_b64 v[2:3], v2, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 -; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 -; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 -; GFX11-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] -; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX11-NEXT: v_div_scale_f64 v[12:13], s0, 1.0, v[2:3], 1.0 -; GFX11-NEXT: v_fma_f64 v[10:11], 
v[10:11], v[14:15], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX11-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 -; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 20, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, 0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, s0, 0x3ff00000, v1, vcc_lo +; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> @@ -2564,155 +2186,29 @@ ; VI-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b64 v[1:2], v2, 1 -; VI-NEXT: v_ffbh_u32_e32 v3, v2 -; VI-NEXT: v_min_u32_e32 v5, 32, v3 -; VI-NEXT: v_lshlrev_b64 v[1:2], v5, v[1:2] -; VI-NEXT: v_lshlrev_b64 v[3:4], v0, 1 -; VI-NEXT: v_min_u32_e32 v0, 1, v1 -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_cvt_f32_u32_e32 v2, v0 -; VI-NEXT: v_ffbh_u32_e32 v0, v4 -; VI-NEXT: v_min_u32_e32 v6, 32, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], v6, 
v[3:4] -; VI-NEXT: v_sub_u32_e32 v3, vcc, 32, v5 -; VI-NEXT: v_min_u32_e32 v0, 1, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 -; VI-NEXT: v_ldexp_f32 v1, v2, v3 -; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 -; VI-NEXT: v_ldexp_f32 v0, v0, v2 -; VI-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; VI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 -; VI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; VI-NEXT: v_div_scale_f32 v5, s[4:5], 1.0, v1, 1.0 -; VI-NEXT: v_rcp_f32_e32 v6, v2 -; VI-NEXT: v_rcp_f32_e32 v7, v3 -; VI-NEXT: v_fma_f32 v8, -v2, v6, 1.0 -; VI-NEXT: v_fma_f32 v6, v8, v6, v6 -; VI-NEXT: v_mul_f32_e32 v8, v4, v6 -; VI-NEXT: v_fma_f32 v10, -v2, v8, v4 -; VI-NEXT: v_fma_f32 v9, -v3, v7, 1.0 -; VI-NEXT: v_fma_f32 v8, v10, v6, v8 -; VI-NEXT: v_fma_f32 v2, -v2, v8, v4 -; VI-NEXT: v_fma_f32 v4, v9, v7, v7 -; VI-NEXT: v_div_fmas_f32 v2, v2, v6, v8 -; VI-NEXT: v_mul_f32_e32 v6, v5, v4 -; VI-NEXT: v_fma_f32 v7, -v3, v6, v5 -; VI-NEXT: v_fma_f32 v6, v7, v4, v6 -; VI-NEXT: v_fma_f32 v3, -v3, v6, v5 -; VI-NEXT: s_mov_b64 vcc, s[4:5] -; VI-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; VI-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; VI-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 23, v2 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 1.0, v0 +; VI-NEXT: v_sub_u32_e32 v1, vcc, 1.0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 1 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 1 -; GFX10-NEXT: v_ffbh_u32_e32 v4, v1 -; GFX10-NEXT: v_ffbh_u32_e32 v5, v3 -; GFX10-NEXT: v_min_u32_e32 v4, 32, v4 -; GFX10-NEXT: v_min_u32_e32 v5, 32, v5 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] -; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 -; 
GFX10-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 32, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v5 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 -; GFX10-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 -; GFX10-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0 -; GFX10-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 -; GFX10-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX10-NEXT: v_fma_f32 v7, -v3, v5, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v5, v7, v5 -; GFX10-NEXT: v_mul_f32_e32 v7, v8, v4 -; GFX10-NEXT: v_mul_f32_e32 v9, v6, v5 -; GFX10-NEXT: v_fma_f32 v10, -v2, v7, v8 -; GFX10-NEXT: v_fma_f32 v11, -v3, v9, v6 -; GFX10-NEXT: v_fmac_f32_e32 v7, v10, v4 -; GFX10-NEXT: v_fmac_f32_e32 v9, v11, v5 -; GFX10-NEXT: v_fma_f32 v2, -v2, v7, v8 -; GFX10-NEXT: v_fma_f32 v3, -v3, v9, v6 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v4, v7 -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v9 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX10-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 23, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 1.0, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, 1.0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 1 -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_clz_i32_u32_e32 v4, v1 -; GFX11-NEXT: v_clz_i32_u32_e32 v5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_u32_e32 v4, 32, v4 -; 
GFX11-NEXT: v_min_u32_e32 v5, 32, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v4 -; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_scale_f32 v2, null, v0, v0, 1.0 -; GFX11-NEXT: v_div_scale_f32 v3, null, v1, v1, 1.0 -; GFX11-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_rcp_f32_e32 v4, v2 -; GFX11-NEXT: v_rcp_f32_e32 v5, v3 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX11-NEXT: v_fma_f32 v7, -v3, v5, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_fmac_f32 v4, v6, v4 :: v_dual_fmac_f32 v5, v7, v5 -; GFX11-NEXT: v_div_scale_f32 v6, s0, 1.0, v1, 1.0 -; GFX11-NEXT: v_mul_f32_e32 v7, v8, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v9, v6, v5 -; GFX11-NEXT: v_fma_f32 v10, -v2, v7, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-NEXT: v_fma_f32 v11, -v3, v9, v6 -; GFX11-NEXT: v_fmac_f32_e32 v7, v10, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fmac_f32_e32 v9, v11, v5 -; GFX11-NEXT: v_fma_f32 v2, -v2, v7, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f32 v3, -v3, v9, v6 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v4, v7 -; GFX11-NEXT: s_mov_b32 vcc_lo, s0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 23, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fmas_f32 v3, v3, v5, v9 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, 1.0, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 1.0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x float> @@ -3352,41 +2848,23 @@ ; VI-LABEL: fdiv_pow_shl_cnt_in_bounds: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 1 -; VI-NEXT: v_cvt_f16_u16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; VI-NEXT: s_movk_i32 s4, 0x7000 -; VI-NEXT: v_rcp_f32_e32 v1, v1 -; VI-NEXT: v_mul_f32_e32 v1, 0x46000000, v1 -; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VI-NEXT: v_div_fixup_f16 v0, v1, v0, s4 +; VI-NEXT: v_lshlrev_b16_e32 v0, 10, v0 +; VI-NEXT: v_sub_u16_e32 v0, 0x7000, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b16 v0, v0, 1 -; GFX10-NEXT: s_mov_b32 s4, 0x46000000 -; GFX10-NEXT: v_cvt_f16_u16_e32 v0, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_fma_mixlo_f16 v1, v1, s4, 0 -; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000 +; GFX10-NEXT: 
v_lshlrev_b16 v0, 10, v0 +; GFX10-NEXT: v_sub_nc_u16 v0, 0x7000, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b16 v0, v0, 1 -; GFX11-NEXT: s_mov_b32 s0, 0x46000000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v1, v1, s0, 0 -; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000 +; GFX11-NEXT: v_lshlrev_b16 v0, 10, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u16 v0, 0x7000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half @@ -3466,41 +2944,23 @@ ; VI-LABEL: fdiv_pow_shl_cnt_in_bounds2: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 1 -; VI-NEXT: v_cvt_f16_u16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; VI-NEXT: s_movk_i32 s4, 0x4800 -; VI-NEXT: v_rcp_f32_e32 v1, v1 -; VI-NEXT: v_mul_f32_e32 v1, 0x41000000, v1 -; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VI-NEXT: v_div_fixup_f16 v0, v1, v0, s4 +; VI-NEXT: v_lshlrev_b16_e32 v0, 10, v0 +; VI-NEXT: v_sub_u16_e32 v0, 0x4800, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b16 v0, v0, 1 -; GFX10-NEXT: s_mov_b32 s4, 0x41000000 -; GFX10-NEXT: v_cvt_f16_u16_e32 v0, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_fma_mixlo_f16 v1, v1, s4, 0 -; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 0x4800 +; GFX10-NEXT: v_lshlrev_b16 v0, 10, v0 +; GFX10-NEXT: v_sub_nc_u16 v0, 0x4800, v0 ; 
GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b16 v0, v0, 1 -; GFX11-NEXT: s_mov_b32 s0, 0x41000000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v1, v1, s0, 0 -; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 0x4800 +; GFX11-NEXT: v_lshlrev_b16 v0, 10, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u16 v0, 0x4800, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half @@ -3667,67 +3127,30 @@ ; VI-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1 -; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 -; VI-NEXT: s_mov_b32 s4, 0 -; VI-NEXT: s_mov_b32 s5, 0x36a00000 -; VI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5] -; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; VI-NEXT: v_div_scale_f64 v[6:7], vcc, s[4:5], v[0:1], s[4:5] -; VI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 -; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; VI-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] -; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] -; VI-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v0, 20, v0 +; VI-NEXT: v_mov_b32_e32 v1, 0x36a00000 +; VI-NEXT: v_sub_u32_e64 v2, vcc, 0, 0 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v0, vcc +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0x36a00000 -; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 -; GFX10-NEXT: v_div_scale_f64 v[2:3], s6, v[0:1], v[0:1], s[4:5] -; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, s[4:5], v[0:1], s[4:5] -; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; GFX10-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; GFX10-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[4:5] +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 20, v0 +; GFX10-NEXT: v_sub_co_u32 v1, vcc_lo, 0, 0 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 1 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 0x36a00000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 -; GFX11-NEXT: v_div_scale_f64 v[2:3], null, v[0:1], v[0:1], s[0:1] -; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, s[0:1], v[0:1], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX11-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX11-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; GFX11-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 20, v0 +; GFX11-NEXT: v_sub_co_u32 v1, vcc_lo, 0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to double @@ -3888,62 +3311,23 @@ ; VI-LABEL: fdiv_pow_shl_cnt32_okay: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1 -; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 -; VI-NEXT: s_mov_b32 s6, 0x11000000 -; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 -; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 -; VI-NEXT: v_rcp_f32_e32 v3, v1 -; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; VI-NEXT: v_fma_f32 v3, v4, v3, v3 -; VI-NEXT: v_mul_f32_e32 v4, v2, v3 -; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 -; VI-NEXT: v_fma_f32 v4, v5, v3, v4 -; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 -; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; VI-NEXT: v_div_fixup_f32 v0, v1, v0, s6 +; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x11000000, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow_shl_cnt32_okay: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0x11000000 -; GFX10-NEXT: v_rcp_f32_e32 v2, v1 -; GFX10-NEXT: v_fma_f32 v3, -v1, 
v2, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0x11000000, v0, 0x11000000 -; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0x11000000 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x11000000, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt32_okay: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x11000000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v2, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0x11000000, v0, 0x11000000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0x11000000 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x11000000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to float diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll 
b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -14,49 +14,26 @@ ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: pslld $23, %xmm0 ; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; CHECK-SSE-NEXT: pand %xmm0, %xmm1 -; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: psrld $16, %xmm0 -; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: addps %xmm1, %xmm0 -; CHECK-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow2_4xfloat: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-AVX2-NEXT: vpsllvd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] -; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; CHECK-AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928] -; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; CHECK-AVX2-NEXT: vsubps %xmm2, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0] -; CHECK-AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; 
CHECK-NO-FASTFMA-LABEL: fmul_pow2_4xfloat: ; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-NO-FASTFMA-NEXT: vpsllvd %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtudq2ps %zmm0, %zmm0 -; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0] -; CHECK-NO-FASTFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vzeroupper +; CHECK-NO-FASTFMA-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] +; CHECK-NO-FASTFMA-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow2_4xfloat: ; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-FMA-NEXT: vpsllvd %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtudq2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq %p2 = shl <4 x i32> , %i %p2_f = uitofp <4 x i32> %p2 to <4 x float> @@ -142,53 +119,17 @@ ; CHECK-SSE-LABEL: fdiv_pow2_4xfloat: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: pslld $23, %xmm0 -; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm1 -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] -; CHECK-SSE-NEXT: pand %xmm1, %xmm0 -; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: psrld $16, %xmm1 -; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: addps %xmm0, %xmm1 -; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0] -; CHECK-SSE-NEXT: divps %xmm1, %xmm0 +; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616] +; CHECK-SSE-NEXT: psubd %xmm0, %xmm1 +; CHECK-SSE-NEXT: 
movdqa %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow2_4xfloat: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-AVX2-NEXT: vpsllvd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] -; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; CHECK-AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928] -; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; CHECK-AVX2-NEXT: vsubps %xmm2, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0] -; CHECK-AVX2-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow2_4xfloat: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-NO-FASTFMA-NEXT: vpsllvd %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtudq2ps %zmm0, %zmm0 -; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0] -; CHECK-NO-FASTFMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vzeroupper -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow2_4xfloat: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-FMA-NEXT: vpsllvd %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtudq2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0] -; CHECK-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow2_4xfloat: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = 
[1091567616,1091567616,1091567616,1091567616] +; CHECK-AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; CHECK-AVX-NEXT: retq %p2 = shl <4 x i32> , %i %p2_f = uitofp <4 x i32> %p2 to <4 x float> %r = fdiv <4 x float> , %p2_f @@ -797,389 +738,18 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-LABEL: fdiv_pow2_8xhalf: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: subq $88, %rsp -; CHECK-SSE-NEXT: .cfi_def_cfa_offset 96 -; CHECK-SSE-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; CHECK-SSE-NEXT: pslld $23, %xmm1 -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] -; CHECK-SSE-NEXT: paddd %xmm2, %xmm1 -; CHECK-SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; CHECK-SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; CHECK-SSE-NEXT: pslld $23, %xmm0 -; CHECK-SSE-NEXT: paddd %xmm2, %xmm0 -; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SSE-NEXT: pextrw $0, %xmm0, %eax -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: pextrw $2, %xmm0, %eax -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: pextrw $4, %xmm0, %eax -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: 
pextrw $6, %xmm0, %eax -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: pextrw $0, %xmm0, %eax -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: pextrw $2, %xmm0, %eax -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: pextrw $4, %xmm0, %eax -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: pextrw $6, %xmm0, %eax -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: callq __extendhfsf2@PLT -; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: divss %xmm0, %xmm1 -; CHECK-SSE-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: callq __extendhfsf2@PLT -; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: divss %xmm0, %xmm1 -; CHECK-SSE-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded 
Reload -; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: callq __extendhfsf2@PLT -; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: divss %xmm0, %xmm1 -; CHECK-SSE-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: callq __extendhfsf2@PLT -; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: divss %xmm0, %xmm1 -; CHECK-SSE-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: callq __extendhfsf2@PLT -; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: divss %xmm0, %xmm1 -; CHECK-SSE-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: callq __extendhfsf2@PLT -; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: divss %xmm0, %xmm1 -; CHECK-SSE-NEXT: movaps %xmm1, 
%xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: callq __extendhfsf2@PLT -; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: divss %xmm0, %xmm1 -; CHECK-SSE-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: callq __extendhfsf2@PLT -; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: divss %xmm0, %xmm1 -; CHECK-SSE-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-SSE-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-SSE-NEXT: addq $88, %rsp -; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8 +; CHECK-SSE-NEXT: psllw $10, %xmm0 +; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672] +; CHECK-SSE-NEXT: psubw %xmm0, %xmm1 +; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow2_8xhalf: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: subq $120, %rsp -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 128 -; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 -; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; CHECK-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; CHECK-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-AVX2-NEXT: vpextrw $0, %xmm0, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 -; CHECK-AVX2-NEXT: vzeroupper -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-AVX2-NEXT: vpextrw $1, %xmm0, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 -; CHECK-AVX2-NEXT: vzeroupper -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 -; CHECK-AVX2-NEXT: vzeroupper -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-AVX2-NEXT: vpextrw $3, %xmm0, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 -; CHECK-AVX2-NEXT: vzeroupper -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-AVX2-NEXT: vpextrw $4, %xmm0, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 -; CHECK-AVX2-NEXT: vzeroupper -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; 
CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-AVX2-NEXT: vpextrw $5, %xmm0, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 -; CHECK-AVX2-NEXT: vzeroupper -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-AVX2-NEXT: vpextrw $6, %xmm0, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 -; CHECK-AVX2-NEXT: vzeroupper -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-AVX2-NEXT: vpextrw $7, %xmm0, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 -; CHECK-AVX2-NEXT: vzeroupper -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: vdivss 
%xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero -; 
CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-AVX2-NEXT: addq $120, %rsp -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow2_8xhalf: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm1 -; CHECK-NO-FASTFMA-NEXT: vpmovdw %zmm1, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vpextrw $7, %xmm0, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; CHECK-NO-FASTFMA-NEXT: vmovd %xmm2, %eax -; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 -; CHECK-NO-FASTFMA-NEXT: vpextrw $6, %xmm0, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm3, %xmm3 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm3, %xmm3 
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm3, %eax -; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3 -; CHECK-NO-FASTFMA-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; CHECK-NO-FASTFMA-NEXT: vpextrw $5, %xmm0, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm4, %xmm3 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; CHECK-NO-FASTFMA-NEXT: vmovd %xmm3, %eax -; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3 -; CHECK-NO-FASTFMA-NEXT: vpextrw $4, %xmm0, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm4, %xmm4 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; CHECK-NO-FASTFMA-NEXT: vmovd %xmm4, %eax -; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4 -; CHECK-NO-FASTFMA-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; CHECK-NO-FASTFMA-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-NO-FASTFMA-NEXT: vpextrw $3, %xmm0, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm3 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; CHECK-NO-FASTFMA-NEXT: vmovd %xmm3, %eax -; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3 -; CHECK-NO-FASTFMA-NEXT: vpextrw $2, %xmm0, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm4 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; CHECK-NO-FASTFMA-NEXT: vmovd %xmm4, %eax -; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4 -; CHECK-NO-FASTFMA-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; CHECK-NO-FASTFMA-NEXT: vpextrw $0, %xmm1, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm1 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; CHECK-NO-FASTFMA-NEXT: vmovd %xmm1, %eax -; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 -; CHECK-NO-FASTFMA-NEXT: vpextrw $1, %xmm0, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; 
CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax -; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NO-FASTFMA-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; CHECK-NO-FASTFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3] -; CHECK-NO-FASTFMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vzeroupper -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow2_8xhalf: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] -; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vpextrw $7, %xmm0, %eax -; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm1 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; CHECK-FMA-NEXT: vmovd %xmm1, %eax -; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 -; CHECK-FMA-NEXT: vpextrw $6, %xmm0, %eax -; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; CHECK-FMA-NEXT: vmovd %xmm2, %eax -; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 -; CHECK-FMA-NEXT: vpextrw $5, %xmm0, %eax -; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm3, %xmm3 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; CHECK-FMA-NEXT: vmovd %xmm3, %eax -; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3 -; CHECK-FMA-NEXT: vpextrw $4, %xmm0, %eax -; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm4, %xmm4 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; CHECK-FMA-NEXT: vmovd %xmm4, %eax -; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4 -; CHECK-FMA-NEXT: vpextrw $3, %xmm0, %eax -; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm5 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm5, %xmm5 -; CHECK-FMA-NEXT: vmovd %xmm5, %eax -; CHECK-FMA-NEXT: vpinsrw $0, 
%eax, %xmm0, %xmm5 -; CHECK-FMA-NEXT: vpextrw $2, %xmm0, %eax -; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm6, %xmm6 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm6, %xmm6 -; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; CHECK-FMA-NEXT: vmovd %xmm6, %eax -; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 -; CHECK-FMA-NEXT: vpextrw $1, %xmm0, %eax -; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm7, %xmm4 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; CHECK-FMA-NEXT: vmovd %xmm4, %eax -; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4 -; CHECK-FMA-NEXT: vpextrw $0, %xmm0, %eax -; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm7, %xmm0 -; CHECK-FMA-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovd %xmm0, %eax -; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; CHECK-FMA-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-FMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %ymm0 -; CHECK-FMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3] -; CHECK-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; CHECK-FMA-NEXT: vzeroupper -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow2_8xhalf: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vpsllw $10, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672] +; CHECK-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; CHECK-AVX-NEXT: retq %p2 = shl <8 x i16> , %i %p2_f = uitofp <8 x i16> %p2 to <8 x 
half> %r = fdiv <8 x half> , %p2_f @@ -1189,50 +759,19 @@ define double @fmul_pow_shl_cnt(i64 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movq %rdi, %rcx -; CHECK-SSE-NEXT: movl $1, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-SSE-NEXT: shlq %cl, %rax -; CHECK-SSE-NEXT: movq %rax, %xmm1 -; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; CHECK-SSE-NEXT: addsd %xmm1, %xmm0 -; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: shlq $52, %rdi +; CHECK-SSE-NEXT: movabsq $4621256167635550208, %rax # imm = 0x4022000000000000 +; CHECK-SSE-NEXT: addq %rdi, %rax +; CHECK-SSE-NEXT: movq %rax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow_shl_cnt: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movq %rdi, %rcx -; CHECK-AVX2-NEXT: movl $1, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX2-NEXT: shlq %cl, %rax -; CHECK-AVX2-NEXT: vmovq %rax, %xmm0 -; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx -; CHECK-NO-FASTFMA-NEXT: movl $1, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt: 
-; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax -; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow_shl_cnt: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: shlq $52, %rdi +; CHECK-AVX-NEXT: movabsq $4621256167635550208, %rax # imm = 0x4022000000000000 +; CHECK-AVX-NEXT: addq %rdi, %rax +; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw i64 1, %cnt %conv = uitofp i64 %shl to double %mul = fmul double 9.000000e+00, %conv @@ -1242,50 +781,21 @@ define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt2: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movq %rdi, %rcx -; CHECK-SSE-NEXT: movl $2, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-SSE-NEXT: shlq %cl, %rax -; CHECK-SSE-NEXT: movq %rax, %xmm1 -; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; CHECK-SSE-NEXT: addsd %xmm1, %xmm0 -; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: incl %edi +; CHECK-SSE-NEXT: shlq $52, %rdi +; CHECK-SSE-NEXT: movabsq $-4602115869219225600, %rax # imm = 0xC022000000000000 +; CHECK-SSE-NEXT: addq %rdi, %rax +; CHECK-SSE-NEXT: movq %rax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow_shl_cnt2: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movq %rdi, %rcx -; CHECK-AVX2-NEXT: movl $2, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX2-NEXT: shlq %cl, %rax -; CHECK-AVX2-NEXT: vmovq %rax, %xmm0 -; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; 
CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt2: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx -; CHECK-NO-FASTFMA-NEXT: movl $2, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt2: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $2, %eax -; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax -; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow_shl_cnt2: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: incl %edi +; CHECK-AVX-NEXT: shlq $52, %rdi +; CHECK-AVX-NEXT: movabsq $-4602115869219225600, %rax # imm = 0xC022000000000000 +; CHECK-AVX-NEXT: addq %rdi, %rax +; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw i64 2, %cnt %conv = uitofp i64 %shl to double %mul = fmul double -9.000000e+00, %conv @@ -1295,49 +805,25 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind { ; CHECK-SSE-LABEL: fmul_pow_select: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movl %edi, %ecx -; CHECK-SSE-NEXT: andl $1, %esi -; CHECK-SSE-NEXT: movl $2, %eax -; CHECK-SSE-NEXT: subl %esi, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-SSE-NEXT: shll %cl, %eax -; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0 -; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-SSE-NEXT: leal 1(%rdi), %eax +; CHECK-SSE-NEXT: testb $1, %sil +; CHECK-SSE-NEXT: cmovnel %edi, %eax +; CHECK-SSE-NEXT: shll $23, %eax +; CHECK-SSE-NEXT: addl $1091567616, %eax # imm = 0x41100000 +; 
CHECK-SSE-NEXT: movd %eax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow_select: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movl %edi, %ecx -; CHECK-AVX2-NEXT: andl $1, %esi -; CHECK-AVX2-NEXT: movl $2, %eax -; CHECK-AVX2-NEXT: subl %esi, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fmul_pow_select: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx -; CHECK-NO-FASTFMA-NEXT: andl $1, %esi -; CHECK-NO-FASTFMA-NEXT: movl $2, %eax -; CHECK-NO-FASTFMA-NEXT: subl %esi, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_select: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: andl $1, %esi -; CHECK-FMA-NEXT: movl $2, %eax -; CHECK-FMA-NEXT: subl %esi, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow_select: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-AVX-NEXT: leal 1(%rdi), %eax +; CHECK-AVX-NEXT: testb $1, %sil +; CHECK-AVX-NEXT: cmovnel %edi, %eax +; CHECK-AVX-NEXT: shll $23, %eax +; CHECK-AVX-NEXT: addl $1091567616, %eax # imm = 0x41100000 +; CHECK-AVX-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-NEXT: retq %shl2 = shl nuw i32 2, %cnt %shl1 = shl nuw i32 1, %cnt %shl = select i1 %c, i32 %shl1, i32 %shl2 @@ -1349,53 +835,25 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_fly_pow_mul_min_pow2: ; CHECK-SSE: # %bb.0: -; 
CHECK-SSE-NEXT: movq %rdi, %rcx -; CHECK-SSE-NEXT: movl $8, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-SSE-NEXT: shlq %cl, %rax -; CHECK-SSE-NEXT: cmpq $8192, %rax # imm = 0x2000 -; CHECK-SSE-NEXT: movl $8192, %ecx # imm = 0x2000 -; CHECK-SSE-NEXT: cmovbq %rax, %rcx -; CHECK-SSE-NEXT: cvtsi2ss %rcx, %xmm0 -; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: addl $3, %edi +; CHECK-SSE-NEXT: cmpl $13, %edi +; CHECK-SSE-NEXT: movl $13, %eax +; CHECK-SSE-NEXT: cmovbl %edi, %eax +; CHECK-SSE-NEXT: shll $23, %eax +; CHECK-SSE-NEXT: addl $1091567616, %eax # imm = 0x41100000 +; CHECK-SSE-NEXT: movd %eax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_fly_pow_mul_min_pow2: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movq %rdi, %rcx -; CHECK-AVX2-NEXT: movl $8, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX2-NEXT: shlq %cl, %rax -; CHECK-AVX2-NEXT: cmpq $8192, %rax # imm = 0x2000 -; CHECK-AVX2-NEXT: movl $8192, %ecx # imm = 0x2000 -; CHECK-AVX2-NEXT: cmovbq %rax, %rcx -; CHECK-AVX2-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fmul_fly_pow_mul_min_pow2: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx -; CHECK-NO-FASTFMA-NEXT: movl $8, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax -; CHECK-NO-FASTFMA-NEXT: cmpq $8192, %rax # imm = 0x2000 -; CHECK-NO-FASTFMA-NEXT: movl $8192, %ecx # imm = 0x2000 -; CHECK-NO-FASTFMA-NEXT: cmovbq %rax, %rcx -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_fly_pow_mul_min_pow2: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $8, %eax -; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax -; CHECK-FMA-NEXT: cmpq $8192, %rax 
# imm = 0x2000 -; CHECK-FMA-NEXT: movl $8192, %ecx # imm = 0x2000 -; CHECK-FMA-NEXT: cmovbq %rax, %rcx -; CHECK-FMA-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_fly_pow_mul_min_pow2: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: addl $3, %edi +; CHECK-AVX-NEXT: cmpl $13, %edi +; CHECK-AVX-NEXT: movl $13, %eax +; CHECK-AVX-NEXT: cmovbl %edi, %eax +; CHECK-AVX-NEXT: shll $23, %eax +; CHECK-AVX-NEXT: addl $1091567616, %eax # imm = 0x41100000 +; CHECK-AVX-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-NEXT: retq %shl8 = shl nuw i64 8, %cnt %shl = call i64 @llvm.umin.i64(i64 %shl8, i64 8192) %conv = uitofp i64 %shl to float @@ -1406,61 +864,27 @@ define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_mul_max_pow2: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movl %edi, %ecx -; CHECK-SSE-NEXT: movl $2, %eax -; CHECK-SSE-NEXT: shll %cl, %eax -; CHECK-SSE-NEXT: movl $1, %edx -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-SSE-NEXT: shll %cl, %edx -; CHECK-SSE-NEXT: cmpw %ax, %dx -; CHECK-SSE-NEXT: cmovbel %eax, %edx -; CHECK-SSE-NEXT: movzwl %dx, %eax -; CHECK-SSE-NEXT: cvtsi2sd %eax, %xmm0 -; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: movl %edi, %eax +; CHECK-SSE-NEXT: leaq 1(%rax), %rcx +; CHECK-SSE-NEXT: cmpq %rcx, %rax +; CHECK-SSE-NEXT: cmovaq %rax, %rcx +; CHECK-SSE-NEXT: shlq $52, %rcx +; CHECK-SSE-NEXT: movabsq $4613937818241073152, %rax # imm = 0x4008000000000000 +; CHECK-SSE-NEXT: addq %rcx, %rax +; CHECK-SSE-NEXT: movq %rax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow_mul_max_pow2: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movl %edi, %ecx -; CHECK-AVX2-NEXT: movl $2, %eax -; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: movl $1, %edx -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX2-NEXT: shll %cl, %edx -; CHECK-AVX2-NEXT: cmpw %ax, %dx -; 
CHECK-AVX2-NEXT: cmovbel %eax, %edx -; CHECK-AVX2-NEXT: movzwl %dx, %eax -; CHECK-AVX2-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fmul_pow_mul_max_pow2: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx -; CHECK-NO-FASTFMA-NEXT: movl $2, %eax -; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax -; CHECK-NO-FASTFMA-NEXT: movl $1, %edx -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NO-FASTFMA-NEXT: shll %cl, %edx -; CHECK-NO-FASTFMA-NEXT: cmpw %ax, %dx -; CHECK-NO-FASTFMA-NEXT: cmovbel %eax, %edx -; CHECK-NO-FASTFMA-NEXT: movzwl %dx, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_mul_max_pow2: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $2, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: movl $1, %ecx -; CHECK-FMA-NEXT: shlxl %edi, %ecx, %ecx -; CHECK-FMA-NEXT: cmpw %ax, %cx -; CHECK-FMA-NEXT: cmoval %ecx, %eax -; CHECK-FMA-NEXT: movzwl %ax, %eax -; CHECK-FMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow_mul_max_pow2: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: movl %edi, %eax +; CHECK-AVX-NEXT: leaq 1(%rax), %rcx +; CHECK-AVX-NEXT: cmpq %rcx, %rax +; CHECK-AVX-NEXT: cmovaq %rax, %rcx +; CHECK-AVX-NEXT: shlq $52, %rcx +; CHECK-AVX-NEXT: movabsq $4613937818241073152, %rax # imm = 0x4008000000000000 +; CHECK-AVX-NEXT: addq %rcx, %rax +; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: retq %shl2 = shl nuw i16 2, %cnt %shl1 = shl nuw i16 1, %cnt %shl = call i16 @llvm.umax.i16(i16 %shl1, i16 %shl2) @@ -1612,57 +1036,26 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec: ; 
CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2,2] -; CHECK-SSE-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE-NEXT: psllq %xmm0, %xmm2 -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; CHECK-SSE-NEXT: psllq %xmm0, %xmm1 -; CHECK-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [4294967295,4294967295] -; CHECK-SSE-NEXT: andpd %xmm1, %xmm0 -; CHECK-SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: psrlq $32, %xmm1 -; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: addpd %xmm0, %xmm1 -; CHECK-SSE-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: movapd %xmm1, %xmm0 +; CHECK-SSE-NEXT: psllq $52, %xmm0 +; CHECK-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] -; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec: ; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] -; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NO-FASTFMA-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-NO-FASTFMA-NEXT: vpsrlq $32, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec: ; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] -; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtuqq2pd %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> @@ -1675,51 +1068,23 @@ ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: pslld $23, %xmm0 ; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2] -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE-NEXT: pmuludq %xmm2, %xmm0 -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] -; CHECK-SSE-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-SSE-NEXT: psrld $16, %xmm0 -; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0 -; CHECK-SSE-NEXT: addps %xmm2, %xmm0 -; CHECK-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: addps %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] -; CHECK-AVX2-NEXT: vpsllvd %xmm0, %xmm2, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] -; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; CHECK-AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1392508928,1392508928,1392508928,1392508928] -; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; CHECK-AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vaddps %xmm0, %xmm2, %xmm0 -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0] -; CHECK-AVX2-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] -; CHECK-NO-FASTFMA-NEXT: vpsllvd %xmm0, %xmm2, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtudq2ps %zmm0, %zmm0 -; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0] -; CHECK-NO-FASTFMA-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192] +; CHECK-NO-FASTFMA-NEXT: vpaddd %xmm2, %xmm0, %xmm0 
; CHECK-NO-FASTFMA-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vzeroupper ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: @@ -1800,58 +1165,15 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2,1] -; CHECK-SSE-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE-NEXT: psllq %xmm0, %xmm2 -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; CHECK-SSE-NEXT: psllq %xmm0, %xmm1 -; CHECK-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [4294967295,4294967295] -; CHECK-SSE-NEXT: andpd %xmm1, %xmm0 -; CHECK-SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: psrlq $32, %xmm1 -; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: addpd %xmm0, %xmm1 -; CHECK-SSE-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: movapd %xmm1, %xmm0 +; CHECK-SSE-NEXT: psllq $52, %xmm0 +; CHECK-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2,1] -; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: 
fmul_pow_shl_cnt_vec_non_splat2_todo: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vmovdqa {{.*#+}} xmm1 = [2,1] -; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NO-FASTFMA-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-NO-FASTFMA-NEXT: vpsrlq $32, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vmovdqa {{.*#+}} xmm1 = [2,1] -; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtuqq2pd %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> %mul = fmul <2 x double> , %conv @@ -2073,112 +1395,42 @@ define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_safe: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movl %edi, %ecx -; CHECK-SSE-NEXT: movl $1, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-SSE-NEXT: shll %cl, %eax -; CHECK-SSE-NEXT: movzwl %ax, %eax -; CHECK-SSE-NEXT: cvtsi2sd %eax, %xmm0 -; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-SSE-NEXT: shlq $52, %rdi +; CHECK-SSE-NEXT: movabsq 
$8930638061065157010, %rax # imm = 0x7BEFFFFFFF5F3992 +; CHECK-SSE-NEXT: addq %rdi, %rax +; CHECK-SSE-NEXT: movq %rax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_safe: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movl %edi, %ecx -; CHECK-AVX2-NEXT: movl $1, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: movzwl %ax, %eax -; CHECK-AVX2-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_safe: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx -; CHECK-NO-FASTFMA-NEXT: movl $1, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax -; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fmul_pow_shl_cnt_safe: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: movzwl %ax, %eax -; CHECK-FMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fmul_pow_shl_cnt_safe: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-AVX-NEXT: shlq $52, %rdi +; CHECK-AVX-NEXT: movabsq $8930638061065157010, %rax # imm = 0x7BEFFFFFFF5F3992 +; CHECK-AVX-NEXT: addq %rdi, %rax +; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to double - %mul = fmul double 9.745314e+288, %conv - ret double %mul -} - -define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { -; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_vec: -; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movdqa {{.*#+}} 
xmm1 = [1,1] -; CHECK-SSE-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE-NEXT: psllq %xmm0, %xmm2 -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; CHECK-SSE-NEXT: psllq %xmm0, %xmm1 -; CHECK-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [4294967295,4294967295] -; CHECK-SSE-NEXT: andpd %xmm1, %xmm0 -; CHECK-SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: psrlq $32, %xmm1 -; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: addpd %xmm0, %xmm1 -; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,1.0E+0] -; CHECK-SSE-NEXT: divpd %xmm1, %xmm0 -; CHECK-SSE-NEXT: retq -; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_vec: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0] -; CHECK-AVX2-NEXT: # xmm1 = mem[0,0] -; CHECK-AVX2-NEXT: vdivpd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_vec: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NO-FASTFMA-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-NO-FASTFMA-NEXT: vpsrlq $32, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpor 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0] -; CHECK-NO-FASTFMA-NEXT: # xmm1 = mem[0,0] -; CHECK-NO-FASTFMA-NEXT: vdivpd %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq + %mul = fmul double 9.745314e+288, %conv + ret double %mul +} + +define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { +; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_vec: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: psllq $52, %xmm0 +; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408] +; CHECK-SSE-NEXT: psubq %xmm0, %xmm1 +; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE-NEXT: retq ; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_vec: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtuqq2pd %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0] -; CHECK-FMA-NEXT: # xmm1 = mem[0,0] -; CHECK-FMA-NEXT: vdivpd %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_vec: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408] +; CHECK-AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> %mul = fdiv <2 x double> , %conv @@ -2188,92 +1440,19 @@ define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,1] -; CHECK-SSE-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE-NEXT: psllq %xmm1, %xmm2 -; CHECK-SSE-NEXT: psllq %xmm0, %xmm3 -; CHECK-SSE-NEXT: movq %xmm3, %rax 
-; CHECK-SSE-NEXT: testq %rax, %rax -; CHECK-SSE-NEXT: js .LBB21_1 -; CHECK-SSE-NEXT: # %bb.2: -; CHECK-SSE-NEXT: xorps %xmm1, %xmm1 -; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 -; CHECK-SSE-NEXT: jmp .LBB21_3 -; CHECK-SSE-NEXT: .LBB21_1: -; CHECK-SSE-NEXT: movq %rax, %rcx -; CHECK-SSE-NEXT: shrq %rcx -; CHECK-SSE-NEXT: andl $1, %eax -; CHECK-SSE-NEXT: orq %rcx, %rax -; CHECK-SSE-NEXT: xorps %xmm1, %xmm1 -; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 -; CHECK-SSE-NEXT: addss %xmm1, %xmm1 -; CHECK-SSE-NEXT: .LBB21_3: -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-SSE-NEXT: movq %xmm0, %rax -; CHECK-SSE-NEXT: testq %rax, %rax -; CHECK-SSE-NEXT: js .LBB21_4 -; CHECK-SSE-NEXT: # %bb.5: -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0 -; CHECK-SSE-NEXT: jmp .LBB21_6 -; CHECK-SSE-NEXT: .LBB21_4: -; CHECK-SSE-NEXT: movq %rax, %rcx -; CHECK-SSE-NEXT: shrq %rcx -; CHECK-SSE-NEXT: andl $1, %eax -; CHECK-SSE-NEXT: orq %rcx, %rax -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0 -; CHECK-SSE-NEXT: addss %xmm0, %xmm0 -; CHECK-SSE-NEXT: .LBB21_6: -; CHECK-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = <1.0E+0,1.0E+0,u,u> -; CHECK-SSE-NEXT: divps %xmm1, %xmm0 +; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; CHECK-SSE-NEXT: pslld $23, %xmm1 +; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm0 = <1065353216,1065353216,u,u> +; CHECK-SSE-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsrlq $1, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 -; CHECK-AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpextrq $1, %xmm1, %rax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; 
CHECK-AVX2-NEXT: vmovq %xmm1, %rax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero -; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; CHECK-AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; CHECK-AVX2-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpextrq $1, %xmm0, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm2, %xmm1 -; CHECK-NO-FASTFMA-NEXT: vmovq %xmm0, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; CHECK-NO-FASTFMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtuqq2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; CHECK-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-AVX-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] +; CHECK-AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw <2 x i64> , %cnt 
%conv = uitofp <2 x i64> %shl to <2 x float> %mul = fdiv <2 x float> , %conv @@ -2521,72 +1700,19 @@ define half @fdiv_pow_shl_cnt_in_bounds(i16 %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_in_bounds: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: pushq %rax -; CHECK-SSE-NEXT: movl %edi, %ecx -; CHECK-SSE-NEXT: movl $1, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-SSE-NEXT: shll %cl, %eax -; CHECK-SSE-NEXT: movzwl %ax, %eax -; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: callq __extendhfsf2@PLT -; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: divss %xmm0, %xmm1 -; CHECK-SSE-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: popq %rax +; CHECK-SSE-NEXT: shll $10, %edi +; CHECK-SSE-NEXT: movl $28672, %eax # imm = 0x7000 +; CHECK-SSE-NEXT: subl %edi, %eax +; CHECK-SSE-NEXT: pinsrw $0, %eax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_in_bounds: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: pushq %rax -; CHECK-AVX2-NEXT: movl %edi, %ecx -; CHECK-AVX2-NEXT: movl $1, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: movzwl %ax, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: popq %rax -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_in_bounds: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx -; CHECK-NO-FASTFMA-NEXT: movl $1, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax -; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm0, 
%xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax -; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_in_bounds: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: movzwl %ax, %eax -; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovd %xmm0, %eax -; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_in_bounds: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: shll $10, %edi +; CHECK-AVX-NEXT: movl $28672, %eax # imm = 0x7000 +; CHECK-AVX-NEXT: subl %edi, %eax +; CHECK-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half %mul = fdiv half 0xH7000, %conv @@ -2596,72 +1722,19 @@ define half @fdiv_pow_shl_cnt_in_bounds2(i16 %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_in_bounds2: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: pushq %rax -; CHECK-SSE-NEXT: movl %edi, %ecx -; CHECK-SSE-NEXT: movl $1, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-SSE-NEXT: shll %cl, %eax -; CHECK-SSE-NEXT: movzwl %ax, %eax -; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0 -; CHECK-SSE-NEXT: callq 
__truncsfhf2@PLT -; CHECK-SSE-NEXT: callq __extendhfsf2@PLT -; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: divss %xmm0, %xmm1 -; CHECK-SSE-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: popq %rax +; CHECK-SSE-NEXT: shll $10, %edi +; CHECK-SSE-NEXT: movl $18432, %eax # imm = 0x4800 +; CHECK-SSE-NEXT: subl %edi, %eax +; CHECK-SSE-NEXT: pinsrw $0, %eax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_in_bounds2: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: pushq %rax -; CHECK-AVX2-NEXT: movl %edi, %ecx -; CHECK-AVX2-NEXT: movl $1, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: movzwl %ax, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: popq %rax -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_in_bounds2: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx -; CHECK-NO-FASTFMA-NEXT: movl $1, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax -; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax -; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; 
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_in_bounds2: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: movzwl %ax, %eax -; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovd %xmm0, %eax -; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_in_bounds2: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: shll $10, %edi +; CHECK-AVX-NEXT: movl $18432, %eax # imm = 0x4800 +; CHECK-AVX-NEXT: subl %edi, %eax +; CHECK-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half %mul = fdiv half 0xH4800, %conv @@ -2746,45 +1819,21 @@ define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movl %edi, %ecx -; CHECK-SSE-NEXT: movl $1, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-SSE-NEXT: shll %cl, %eax -; CHECK-SSE-NEXT: cvtsi2sd %rax, %xmm1 -; CHECK-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-SSE-NEXT: divsd %xmm1, %xmm0 +; CHECK-SSE-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-SSE-NEXT: shlq $52, %rdi +; CHECK-SSE-NEXT: movabsq $3936146074321813504, %rax # imm = 0x36A0000000000000 +; CHECK-SSE-NEXT: subq %rdi, %rax +; CHECK-SSE-NEXT: movq %rax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movl %edi, %ecx -; CHECK-AVX2-NEXT: movl $1, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX2-NEXT: shll %cl, %eax 
-; CHECK-AVX2-NEXT: vcvtsi2sd %rax, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-AVX2-NEXT: vdivsd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx -; CHECK-NO-FASTFMA-NEXT: movl $1, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %eax, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NO-FASTFMA-NEXT: vdivsd %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: vcvtusi2sd %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-FMA-NEXT: vdivsd %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-AVX-NEXT: shlq $52, %rdi +; CHECK-AVX-NEXT: movabsq $3936146074321813504, %rax # imm = 0x36A0000000000000 +; CHECK-AVX-NEXT: subq %rdi, %rax +; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to double %mul = fdiv double 0x36A0000000000000, %conv @@ -2842,45 +1891,19 @@ define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_okay: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movl %edi, %ecx -; CHECK-SSE-NEXT: movl $1, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-SSE-NEXT: shll %cl, %eax -; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 -; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-SSE-NEXT: divss %xmm1, %xmm0 +; CHECK-SSE-NEXT: shll $23, %edi +; CHECK-SSE-NEXT: movl $285212672, %eax # imm = 0x11000000 +; CHECK-SSE-NEXT: subl 
%edi, %eax +; CHECK-SSE-NEXT: movd %eax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt32_okay: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movl %edi, %ecx -; CHECK-AVX2-NEXT: movl $1, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt32_okay: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx -; CHECK-NO-FASTFMA-NEXT: movl $1, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt32_okay: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: movl $1, %eax -; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt32_okay: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: shll $23, %edi +; CHECK-AVX-NEXT: movl $285212672, %eax # imm = 0x11000000 +; CHECK-AVX-NEXT: subl %edi, %eax +; CHECK-AVX-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-NEXT: retq %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to float %mul = fdiv float 0x3a20000000000000, %conv