Index: include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- include/llvm/CodeGen/SelectionDAGNodes.h +++ include/llvm/CodeGen/SelectionDAGNodes.h @@ -347,6 +347,10 @@ /// These are IR-level optimization flags that may be propagated to SDNodes. /// TODO: This data structure should be shared by the IR optimizer and the /// the backend. +/// Propagation of Flags from Instruction to SDNode is done by +/// SDNodeFlagsAcquirer after the DAG node is created. Any flags which are set +/// during the Build DAG are eventually merged with flags which are present over +/// Instruction (IR). struct SDNodeFlags { private: // This bit is used to determine if the flags are in a defined state. @@ -354,6 +358,16 @@ // are defined. bool AnyDefined : 1; + // Following two bit are used for Flags propagation from + // a DAG node to its operands. When Propagate bit is set then + // Flags from DAG node are propagated to only those operands which + // have their Acquire bit set. + // These bits are set by invocation of + // SDNodeFlagsAcquirer::PropagateFlagsToOperands and reset once the + // propagation is through. + bool PropagateFlagsToOperands : 1; + bool AcquireFlagsFromUser : 1; + bool NoUnsignedWrap : 1; bool NoSignedWrap : 1; bool Exact : 1; @@ -368,57 +382,60 @@ public: /// Default constructor turns off all optimization flags. SDNodeFlags() - : AnyDefined(false), NoUnsignedWrap(false), NoSignedWrap(false), + : AnyDefined(false), PropagateFlagsToOperands(false), + AcquireFlagsFromUser(false), NoUnsignedWrap(false), NoSignedWrap(false), Exact(false), UnsafeAlgebra(false), NoNaNs(false), NoInfs(false), NoSignedZeros(false), AllowReciprocal(false), VectorReduction(false), AllowContract(false) {} /// Sets the state of the flags to the defined state. - void setDefined() { AnyDefined = true; } + void setDefined(bool Val) { AnyDefined = Val; } /// Returns true if the flags are in a defined state. bool isDefined() const { return AnyDefined; } // These are mutators for each flag. - void setNoUnsignedWrap(bool b) { - setDefined(); + void setNoUnsignedWrap(bool b, bool Commit = true) { + setDefined(Commit); NoUnsignedWrap = b; } - void setNoSignedWrap(bool b) { - setDefined(); + void setNoSignedWrap(bool b, bool Commit = true) { + setDefined(Commit); NoSignedWrap = b; } - void setExact(bool b) { - setDefined(); + void setExact(bool b, bool Commit = true) { + setDefined(Commit); Exact = b; } - void setUnsafeAlgebra(bool b) { - setDefined(); + void setUnsafeAlgebra(bool b, bool Commit = true) { + setDefined(Commit); UnsafeAlgebra = b; } - void setNoNaNs(bool b) { - setDefined(); + void setNoNaNs(bool b, bool Commit = true) { + setDefined(Commit); NoNaNs = b; } - void setNoInfs(bool b) { - setDefined(); + void setNoInfs(bool b, bool Commit = true) { + setDefined(Commit); NoInfs = b; } - void setNoSignedZeros(bool b) { - setDefined(); + void setNoSignedZeros(bool b, bool Commit = true) { + setDefined(Commit); NoSignedZeros = b; } - void setAllowReciprocal(bool b) { - setDefined(); + void setAllowReciprocal(bool b, bool Commit = true) { + setDefined(Commit); AllowReciprocal = b; } - void setVectorReduction(bool b) { - setDefined(); + void setVectorReduction(bool b, bool Commit = true) { + setDefined(Commit); VectorReduction = b; } - void setAllowContract(bool b) { - setDefined(); + void setAllowContract(bool b, bool Commit = true) { + setDefined(Commit); AllowContract = b; } + void setAcquireFlagsFromUser(bool b) { AcquireFlagsFromUser = b; } + void setPropagateFlagsToOperands(bool b) { PropagateFlagsToOperands = b; } // These are accessors for each flag. bool hasNoUnsignedWrap() const { return NoUnsignedWrap; } @@ -432,6 +449,9 @@ bool hasVectorReduction() const { return VectorReduction; } bool hasAllowContract() const { return AllowContract; } + bool hasPropagateFlagsToOperands() const { return PropagateFlagsToOperands; } + bool hasAcquireFlagsFromUser() const { return AcquireFlagsFromUser; } + /// Clear any flags in this flag set that aren't also set in Flags. /// If the given Flags are undefined then don't do anything. void intersectWith(const SDNodeFlags Flags) { @@ -447,7 +467,25 @@ AllowReciprocal &= Flags.AllowReciprocal; VectorReduction &= Flags.VectorReduction; AllowContract &= Flags.AllowContract; + AnyDefined = true; } + + void mergeWith(const SDNodeFlags Flags) { + if (!Flags.isDefined()) + return; + NoUnsignedWrap |= Flags.NoUnsignedWrap; + NoSignedWrap |= Flags.NoSignedWrap; + Exact |= Flags.Exact; + UnsafeAlgebra |= Flags.UnsafeAlgebra; + NoNaNs |= Flags.NoNaNs; + NoInfs |= Flags.NoInfs; + NoSignedZeros |= Flags.NoSignedZeros; + AllowReciprocal |= Flags.AllowReciprocal; + VectorReduction |= Flags.VectorReduction; + AllowContract |= Flags.AllowContract; + AnyDefined = true; + } + }; /// Represents one node in the SelectionDAG. Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -8145,8 +8145,8 @@ unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff); SDLoc DL(LN0); // The original load itself didn't wrap, so an offset within it doesn't. - SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); + SDNodeFlags Flags = LN0->getFlags(); + SDValue NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, LN0->getBasePtr(), DAG.getConstant(PtrOff, DL, PtrType), @@ -9566,7 +9566,7 @@ // Floating-point multiply-add without intermediate rounding. bool HasFMA = (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) && - TLI.isFMAFasterThanFMulAndFAdd(VT) && + TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // Floating-point multiply-add with intermediate rounding. This can result @@ -9700,7 +9700,7 @@ } // If 'unsafe math' is enabled, fold lots of things. - if (Options.UnsafeFPMath) { + if (Options.UnsafeFPMath || Flags.hasUnsafeAlgebra()) { // No FP constant should be created after legalization as Instruction // Selection pass has a hard time dealing with FP constants. bool AllowNewConst = (Level < AfterLegalizeDAG); @@ -9846,7 +9846,7 @@ } // If 'unsafe math' is enabled, fold lots of things. - if (Options.UnsafeFPMath) { + if (Options.UnsafeFPMath || Flags.hasUnsafeAlgebra()) { // (fsub A, 0) -> A if (N1CFP && N1CFP->isZero()) return N0; @@ -9911,7 +9911,7 @@ if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; - if (Options.UnsafeFPMath) { + if (Options.UnsafeFPMath || Flags.hasUnsafeAlgebra()) { // fold (fmul A, 0) -> 0 if (N1CFP && N1CFP->isZero()) return N1; @@ -10039,7 +10039,6 @@ EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; - // Constant fold FMA. if (isa(N0) && isa(N1) && @@ -10047,7 +10046,9 @@ return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2); } - if (Options.UnsafeFPMath) { + SDNodeFlags Flags = N->getFlags(); + bool UnsafeFPMath = Options.UnsafeFPMath || Flags.hasUnsafeAlgebra(); + if (UnsafeFPMath) { if (N0CFP && N0CFP->isZero()) return N2; if (N1CFP && N1CFP->isZero()) @@ -10064,12 +10065,7 @@ !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2); - // TODO: FMA nodes should have flags that propagate to the created nodes. - // For now, create a Flags object for use with all unsafe math transforms. - SDNodeFlags Flags; - Flags.setUnsafeAlgebra(true); - - if (Options.UnsafeFPMath) { + if (UnsafeFPMath) { // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) && isConstantFPBuildVectorOrConstantFP(N1) && @@ -10107,7 +10103,7 @@ } } - if (Options.UnsafeFPMath) { + if (UnsafeFPMath) { // (fma x, c, x) -> (fmul x, (c+1)) if (N1CFP && N0 == N2) { return DAG.getNode(ISD::FMUL, DL, VT, N0, @@ -10214,7 +10210,8 @@ if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; - if (Options.UnsafeFPMath) { + bool UnsafeFPMath = Options.UnsafeFPMath || Flags.hasUnsafeAlgebra(); + if (UnsafeFPMath) { // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. if (N1CFP) { // Compute the reciprocal 1.0 / c2. @@ -10330,11 +10327,7 @@ if (TLI.isFsqrtCheap(N0, DAG)) return SDValue(); - // TODO: FSQRT nodes should have flags that propagate to the created nodes. - // For now, create a Flags object for use with all unsafe math transforms. - SDNodeFlags Flags; - Flags.setUnsafeAlgebra(true); - return buildSqrtEstimate(N0, Flags); + return buildSqrtEstimate(N0, N->getFlags()); } /// copysign(x, fp_extend(y)) -> copysign(x, y) @@ -14394,7 +14387,7 @@ Mask[i] = Vec2Offset + ExtIndex; } } - + // The type the input vectors may have changed above. InVT1 = VecIn1.getValueType(); Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -661,6 +661,10 @@ SDValue getValue(const Value *V); bool findValue(const Value *V) const; + // Returns DAG node of SDValue present in NodeMap for + // a given Value. + SDNode *getDAGNode(const Value *); + SDValue getNonRegisterValue(const Value *V); SDValue getValueImpl(const Value *V); Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -84,6 +84,87 @@ "for some float libcalls"), cl::location(LimitFloatPrecision), cl::init(0)); + +static bool isVectorReductionOp(const User *I); + +/// This class is used for propagating Flags from Instruction to SDNode. +/// These flags are later used by accessing SDNode during different +/// DAG phases. +/// Propagation is done once the DAG node is created. Any flag which is +/// applied during Build DAG phase is eventually merged with the flags +/// over Instruction. Since a DAG node could be shared b/w multiple Instructions +/// thus flags held by node are intersection of flags contributed by +/// each instruction. +class SDNodeFlagsAcquirer { +public: + SDNodeFlagsAcquirer(const Instruction *I, SelectionDAGBuilder *SDB) + : Instr(I), SelDB(SDB) {} + + ~SDNodeFlagsAcquirer() { + SDNode *Node = SelDB->getDAGNode(Instr); + if (Node) { + SDNodeFlags InstrFlags; + SDNodeFlags Flags = Node->getFlags(); + bool PropFlagsToOperands = Flags.hasPropagateFlagsToOperands(); + + if (isa(*Instr)) { + InstrFlags.setNoNaNs(Instr->hasNoNaNs()); + InstrFlags.setNoInfs(Instr->hasNoInfs()); + InstrFlags.setUnsafeAlgebra(Instr->hasUnsafeAlgebra()); + InstrFlags.setNoSignedZeros(Instr->hasNoSignedZeros()); + InstrFlags.setAllowContract(Instr->hasAllowContract()); + InstrFlags.setAllowReciprocal(Instr->hasAllowReciprocal()); + } + + if (auto *OFBinOp = dyn_cast(Instr)) { + InstrFlags.setNoSignedWrap(OFBinOp->hasNoSignedWrap()); + InstrFlags.setNoUnsignedWrap(OFBinOp->hasNoUnsignedWrap()); + } + + if (auto *ExactOp = dyn_cast(Instr)) + InstrFlags.setExact(ExactOp->isExact()); + + if (isVectorReductionOp(Instr)) + InstrFlags.setVectorReduction(true); + + Flags.setAcquireFlagsFromUser(false); + Flags.setPropagateFlagsToOperands(false); + + if (!Flags.isDefined()) + Flags.mergeWith(InstrFlags); + else + Flags.intersectWith(InstrFlags); + + Node->setFlags(Flags); + if (PropFlagsToOperands) + std::for_each(Node->op_begin(), Node->op_end(), + [&](const SDValue &Val) { + if (Val.getNode()->getFlags().hasAcquireFlagsFromUser()) + Val.getNode()->setFlags(Node->getFlags()); + }); + } + } + + // This function sets the Propagation bit over Parent DAG Node + // and Acquire bit over Operand DAG node[s] which inherits the + // flags from its parent. + static void PropagateFlagsToOperands(SDValue &Parent, + ArrayRef Operands) { + SDNodeFlags PFlags = Parent.getNode()->getFlags(); + PFlags.setPropagateFlagsToOperands(true); + Parent.getNode()->setFlags(PFlags); + + SDNodeFlags CFlags; + CFlags.setAcquireFlagsFromUser(true); + for (auto &Val : Operands) + Val.getNode()->setFlags(CFlags); + } + +private: + const Instruction *Instr; + SelectionDAGBuilder *SelDB; +}; + // Limit the width of DAG chains. This is important in general to prevent // DAG-based analysis from blowing up. For example, alias analysis and // load clustering may not complete in reasonable time. It is difficult to @@ -977,6 +1058,8 @@ } void SelectionDAGBuilder::visit(const Instruction &I) { + SDNodeFlagsAcquirer Flags(&I, this); + // Set up outgoing PHI node register values before emitting the terminator. if (isa(&I)) { HandlePHINodesInSuccessorBlocks(I.getParent()); @@ -1058,6 +1141,12 @@ return Result; } +SDNode * SelectionDAGBuilder::getDAGNode(const Value *V) { + if (NodeMap.find(V) == NodeMap.end()) + return nullptr; + return NodeMap[V].getNode(); +} + /// getValue - Return an SDValue for the given Value. SDValue SelectionDAGBuilder::getValue(const Value *V) { // If we already have an SDValue for this value, use it. It's important @@ -1423,7 +1512,7 @@ // An aggregate return value cannot wrap around the address space, so // offsets to its parts don't wrap either. SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); + Flags.setNoUnsignedWrap(true, false); SmallVector Chains(NumValues); for (unsigned i = 0; i != NumValues; ++i) { @@ -2637,42 +2726,11 @@ SDValue Op1 = getValue(I.getOperand(0)); SDValue Op2 = getValue(I.getOperand(1)); - bool nuw = false; - bool nsw = false; - bool exact = false; - bool vec_redux = false; - FastMathFlags FMF; - - if (const OverflowingBinaryOperator *OFBinOp = - dyn_cast(&I)) { - nuw = OFBinOp->hasNoUnsignedWrap(); - nsw = OFBinOp->hasNoSignedWrap(); - } - if (const PossiblyExactOperator *ExactOp = - dyn_cast(&I)) - exact = ExactOp->isExact(); - if (const FPMathOperator *FPOp = dyn_cast(&I)) - FMF = FPOp->getFastMathFlags(); - - if (isVectorReductionOp(&I)) { - vec_redux = true; + if (isVectorReductionOp(&I)) DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n"); - } - - SDNodeFlags Flags; - Flags.setExact(exact); - Flags.setNoSignedWrap(nsw); - Flags.setNoUnsignedWrap(nuw); - Flags.setVectorReduction(vec_redux); - Flags.setAllowReciprocal(FMF.allowReciprocal()); - Flags.setAllowContract(FMF.allowContract()); - Flags.setNoInfs(FMF.noInfs()); - Flags.setNoNaNs(FMF.noNaNs()); - Flags.setNoSignedZeros(FMF.noSignedZeros()); - Flags.setUnsafeAlgebra(FMF.unsafeAlgebra()); SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(), - Op1, Op2, Flags); + Op1, Op2); setValue(&I, BinNodeValue); } @@ -2705,27 +2763,7 @@ Op2 = DAG.getZExtOrTrunc(Op2, DL, MVT::i32); } - bool nuw = false; - bool nsw = false; - bool exact = false; - - if (Opcode == ISD::SRL || Opcode == ISD::SRA || Opcode == ISD::SHL) { - - if (const OverflowingBinaryOperator *OFBinOp = - dyn_cast(&I)) { - nuw = OFBinOp->hasNoUnsignedWrap(); - nsw = OFBinOp->hasNoSignedWrap(); - } - if (const PossiblyExactOperator *ExactOp = - dyn_cast(&I)) - exact = ExactOp->isExact(); - } - SDNodeFlags Flags; - Flags.setExact(exact); - Flags.setNoSignedWrap(nsw); - Flags.setNoUnsignedWrap(nuw); - SDValue Res = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(), Op1, Op2, - Flags); + SDValue Res = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(), Op1, Op2); setValue(&I, Res); } @@ -2733,11 +2771,7 @@ SDValue Op1 = getValue(I.getOperand(0)); SDValue Op2 = getValue(I.getOperand(1)); - SDNodeFlags Flags; - Flags.setExact(isa(&I) && - cast(&I)->isExact()); - setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(), Op1, - Op2, Flags)); + setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(), Op1, Op2)); } void SelectionDAGBuilder::visitICmp(const User &I) { @@ -3358,7 +3392,7 @@ // interpreted as signed, assume there is no unsigned overflow. SDNodeFlags Flags; if (int64_t(Offset) >= 0 && cast(I).isInBounds()) - Flags.setNoUnsignedWrap(true); + Flags.setNoUnsignedWrap(true, false); N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, DAG.getConstant(Offset, dl, N.getValueType()), Flags); @@ -3389,7 +3423,7 @@ // interpreted as signed, assume there is no unsigned overflow. SDNodeFlags Flags; if (Offs.isNonNegative() && cast(I).isInBounds()) - Flags.setNoUnsignedWrap(true); + Flags.setNoUnsignedWrap(true, false); N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, Flags); continue; @@ -3466,7 +3500,7 @@ // by add SA-1 to the size. This doesn't overflow because we're computing // an address inside an alloca. SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); + Flags.setNoUnsignedWrap(true, false); AllocSize = DAG.getNode(ISD::ADD, dl, AllocSize.getValueType(), AllocSize, DAG.getIntPtrConstant(StackAlign - 1, dl), Flags); @@ -3550,7 +3584,7 @@ // An aggregate load cannot wrap around the address space, so offsets to its // parts don't wrap either. SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); + Flags.setNoUnsignedWrap(true, false); SmallVector Values(NumValues); SmallVector Chains(std::min(MaxParallelChains, NumValues)); @@ -3718,7 +3752,7 @@ // An aggregate load cannot wrap around the address space, so offsets to its // parts don't wrap either. SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); + Flags.setNoUnsignedWrap(true, false); unsigned ChainI = 0; for (unsigned i = 0; i != NumValues; ++i, ++ChainI) { @@ -5501,6 +5535,8 @@ getValue(I.getArgOperand(0)).getValueType(), Mul, getValue(I.getArgOperand(2))); + + SDNodeFlagsAcquirer::PropagateFlagsToOperands(Add,{Mul}); setValue(&I, Add); } return nullptr; @@ -7920,8 +7956,6 @@ FastMathFlags FMF; if (isa(I)) FMF = I.getFastMathFlags(); - SDNodeFlags SDFlags; - SDFlags.setNoNaNs(FMF.noNaNs()); switch (Intrinsic) { case Intrinsic::experimental_vector_reduce_fadd: @@ -7964,11 +7998,11 @@ Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1); break; case Intrinsic::experimental_vector_reduce_fmax: { - Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags); + Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1); break; } case Intrinsic::experimental_vector_reduce_fmin: { - Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags); + Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1); break; } default: Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -2946,7 +2946,7 @@ DAG.getConstant(ShAmt, dl, TLI.getShiftAmountTy(Op1.getValueType(), DAG.getDataLayout())); SDNodeFlags Flags; - Flags.setExact(true); + Flags.setExact(Op1.getNode()->getFlags().hasExact()); Op1 = DAG.getNode(ISD::SRA, dl, Op1.getValueType(), Op1, Amt, Flags); Created.push_back(Op1.getNode()); d.ashrInPlace(ShAmt); Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4961,8 +4961,7 @@ SDLoc DL(Operand); EVT VT = Operand.getValueType(); - SDNodeFlags Flags; - Flags.setUnsafeAlgebra(true); + SDNodeFlags Flags = Operand.getNode()->getFlags(); // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) @@ -5001,8 +5000,7 @@ SDLoc DL(Operand); EVT VT = Operand.getValueType(); - SDNodeFlags Flags; - Flags.setUnsafeAlgebra(true); + SDNodeFlags Flags = Operand.getNode()->getFlags(); // Newton reciprocal iteration: E * (2 - X * E) // AArch64 reciprocal iteration instruction: (2 - M * N) Index: test/CodeGen/X86/fmf-flags.ll =================================================================== --- test/CodeGen/X86/fmf-flags.ll +++ test/CodeGen/X86/fmf-flags.ll @@ -7,9 +7,12 @@ define float @fast_recip_sqrt(float %x) { ; X64-LABEL: fast_recip_sqrt: ; X64: # BB#0: -; X64-NEXT: sqrtss %xmm0, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: divss %xmm1, %xmm0 +; X64-NEXT: rsqrtss %xmm0, %xmm1 +; X64-NEXT: mulss %xmm1, %xmm0 +; X64-NEXT: mulss %xmm1, %xmm0 +; X64-NEXT: addss {{.*}}(%rip), %xmm0 +; X64-NEXT: mulss {{.*}}(%rip), %xmm1 +; X64-NEXT: mulss %xmm1, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: fast_recip_sqrt: @@ -29,18 +32,13 @@ define float @fast_fmuladd_opts(float %a , float %b , float %c) { ; X64-LABEL: fast_fmuladd_opts: ; X64: # BB#0: -; X64-NEXT: movaps %xmm0, %xmm1 -; X64-NEXT: addss %xmm1, %xmm1 -; X64-NEXT: addss %xmm0, %xmm1 -; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: mulss {{.*}}(%rip), %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: fast_fmuladd_opts: ; X86: # BB#0: ; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: fld %st(0) -; X86-NEXT: fadd %st(1) -; X86-NEXT: faddp %st(1) +; X86-NEXT: fmuls {{\.LCPI.*}} ; X86-NEXT: retl %res = call fast float @llvm.fmuladd.f32(float %a, float 2.0, float %a) ret float %res @@ -55,7 +53,7 @@ ; X64: # BB#0: ; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: mulsd %xmm0, %xmm1 -; X64-NEXT: addsd %xmm1, %xmm0 +; X64-NEXT: mulsd {{.*}}(%rip), %xmm0 ; X64-NEXT: movsd %xmm1, {{.*}}(%rip) ; X64-NEXT: retq ; @@ -64,7 +62,9 @@ ; X86-NEXT: fldl {{[0-9]+}}(%esp) ; X86-NEXT: fld %st(0) ; X86-NEXT: fmull {{\.LCPI.*}} -; X86-NEXT: fadd %st(0), %st(1) +; X86-NEXT: fxch %st(1) +; X86-NEXT: fmull {{\.LCPI.*}} +; X86-NEXT: fxch %st(1) ; X86-NEXT: fstpl mul1 ; X86-NEXT: retl %m = fmul double %x, 4.2 @@ -80,10 +80,14 @@ define float @not_so_fast_recip_sqrt(float %x) { ; X64-LABEL: not_so_fast_recip_sqrt: ; X64: # BB#0: -; X64-NEXT: sqrtss %xmm0, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: divss %xmm1, %xmm0 -; X64-NEXT: movss %xmm1, {{.*}}(%rip) +; X64-NEXT: rsqrtss %xmm0, %xmm1 +; X64-NEXT: sqrtss %xmm0, %xmm2 +; X64-NEXT: mulss %xmm1, %xmm0 +; X64-NEXT: mulss %xmm1, %xmm0 +; X64-NEXT: addss {{.*}}(%rip), %xmm0 +; X64-NEXT: mulss {{.*}}(%rip), %xmm1 +; X64-NEXT: mulss %xmm1, %xmm0 +; X64-NEXT: movss %xmm2, {{.*}}(%rip) ; X64-NEXT: retq ; ; X86-LABEL: not_so_fast_recip_sqrt: