Index: include/llvm/CodeGen/MachineCombinerPattern.h =================================================================== --- include/llvm/CodeGen/MachineCombinerPattern.h +++ include/llvm/CodeGen/MachineCombinerPattern.h @@ -71,7 +71,10 @@ FMLSv2f32_OP2, FMLSv2f64_OP2, FMLSv4i32_indexed_OP2, - FMLSv4f32_OP2 + FMLSv4f32_OP2, + + // This is the FDIV-RECIP pattern matched by the X86 machine combiner + Div2RecipEst }; } // end namespace llvm Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -286,7 +286,8 @@ /// based on the function's attributes. If the operation is not overridden by /// the function's attributes, "Unspecified" is returned and target defaults /// are expected to be used for instruction selection. - int getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF) const; + virtual int getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF, + bool forDAG = true) const; /// Return the refinement step count for a square root of the given type based /// on the function's attributes.
If the operation is not overridden by Index: lib/CodeGen/MachineCombiner.cpp =================================================================== --- lib/CodeGen/MachineCombiner.cpp +++ lib/CodeGen/MachineCombiner.cpp @@ -155,9 +155,16 @@ assert(DefInstr && "There must be a definition for a new virtual register"); DepthOp = InstrDepth[II->second]; - LatencyOp = TSchedModel.computeOperandLatency( - DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()), - InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg())); + int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg()); + int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg()); + assert((DefIdx || UseIdx) && "Invalid reg usage"); + if (DefIdx < 0 || UseIdx < 0) + // Without def/use indexes we can't compute latency based on the sched + // model; that's why we're forced to use the default value + LatencyOp = TII->defaultDefLatency(SchedModel, *DefInstr); + else + LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx, + InstrPtr, UseIdx); } else { MachineInstr *DefInstr = getOperandDef(MO); if (DefInstr) { @@ -267,8 +274,12 @@ // dependency cycles) in the critical path to proceed with the transform. // Being conservative also protects against inaccuracies in the underlying // machine trace metrics and CPU models. - if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) + if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) { + DEBUG(dbgs() << "It MustReduceDepth "); + DEBUG(NewRootDepth < RootDepth ? dbgs() << "and it does it\n" + : dbgs() << "but it does NOT do it\n"); return NewRootDepth < RootDepth; + } // A more flexible cost calculation for the critical path includes the slack // of the original code sequence.
This may allow the transform to proceed @@ -282,16 +293,18 @@ unsigned RootSlack = BlockTrace.getInstrSlack(*Root); + unsigned NewCycleCount = NewRootDepth + NewRootLatency; + unsigned OldCycleCount = RootDepth + RootLatency + RootSlack; + DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n"; dbgs() << " RootLatency: " << RootLatency << "\n"; dbgs() << " RootSlack: " << RootSlack << "\n"; - dbgs() << " NewRootDepth + NewRootLatency = " - << NewRootDepth + NewRootLatency << "\n"; - dbgs() << " RootDepth + RootLatency + RootSlack = " - << RootDepth + RootLatency + RootSlack << "\n";); - - unsigned NewCycleCount = NewRootDepth + NewRootLatency; - unsigned OldCycleCount = RootDepth + RootLatency + RootSlack; + dbgs() << " NewRootDepth + NewRootLatency = " << NewCycleCount << "\n"; + dbgs() << " RootDepth + RootLatency + RootSlack = " << OldCycleCount + << "\n";); + DEBUG(NewCycleCount <= OldCycleCount + ? dbgs() << "It improves PathLen\n" + : dbgs() << "It does NOT improve PathLen"); return NewCycleCount <= OldCycleCount; } @@ -340,6 +353,9 @@ DEBUG(dbgs() << "RESOURCE DATA: \n"; dbgs() << " resource len before: " << ResLenBeforeCombine << " after: " << ResLenAfterCombine << "\n";); + DEBUG(ResLenAfterCombine <= ResLenBeforeCombine + ? dbgs() << "It preserves ResourceLen\n" + : dbgs() << "It does NOT preserve ResourceLen\n"); return ResLenAfterCombine <= ResLenBeforeCombine; } Index: lib/CodeGen/MachineTraceMetrics.cpp =================================================================== --- lib/CodeGen/MachineTraceMetrics.cpp +++ lib/CodeGen/MachineTraceMetrics.cpp @@ -499,6 +499,7 @@ } /// Invalidate traces through BadMBB. 
+// TODO: this code should be refactored because it really increases compile time void MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) { SmallVector WorkList; Index: lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- lib/CodeGen/TargetLoweringBase.cpp +++ lib/CodeGen/TargetLoweringBase.cpp @@ -2079,8 +2079,9 @@ return getOpEnabled(true, VT, getRecipEstimateForFunc(MF)); } -int TargetLoweringBase::getRecipEstimateDivEnabled(EVT VT, - MachineFunction &MF) const { +int TargetLoweringBase::getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF, + bool forDAGCombiner) const { + return getOpEnabled(false, VT, getRecipEstimateForFunc(MF)); } Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -1281,6 +1281,14 @@ int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override; + /// Return a ReciprocalEstimate enum value for a division of the given type + /// based on the function's attributes. If the operation is not overridden + /// by + /// the function's attributes, "Unspecified" is returned and target defaults + /// are expected to be used for instruction selection. + int getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF, + bool forDAG) const override; + /// Use rcp* to speed up fdiv calculations. SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override; Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -16348,6 +16348,17 @@ return SDValue(); } +/// Return a ReciprocalEstimate enum value for a division of the given type +/// based on the function's attributes. 
If the operation is not overridden by +/// the function's attributes, "Unspecified" is returned and target defaults +/// are expected to be used for instruction selection. +int X86TargetLowering::getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF, + bool forDAG) const { + if (!Subtarget.hasAVX512() && forDAG) + return ReciprocalEstimate::Disabled; + return TargetLoweringBase::getRecipEstimateDivEnabled(VT, MF); +} + /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, Index: lib/Target/X86/X86InstrInfo.h =================================================================== --- lib/Target/X86/X86InstrInfo.h +++ lib/Target/X86/X86InstrInfo.h @@ -494,6 +494,21 @@ return true; } + /// When getMachineCombinerPatterns() finds patterns, this function generates + /// the instructions that could replace the original code sequence + void genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern Pattern, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg) const override; + + /// Return true when there is potentially a faster code sequence + /// for an instruction chain ending in . All potential patterns are + /// listed in the array. + bool getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl &Patterns) const override; + bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; bool hasReassociableOperands(const MachineInstr &Inst, Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -9757,6 +9757,390 @@ } } +// The dividend could be ExactlyOne value and in this case we should not create +// additional constant for reciprocal division but use the dividend instead. 
+// We're trying to find the dividend definition and if it is a constant +// ExactlyOne value we'll use it. +static bool isDividendExactlyOne(MachineFunction &MF, unsigned DividendReg) { + if (MachineInstr *MI = MF.getRegInfo().getUniqueVRegDef(DividendReg)) { + auto Constants = + MI->getParent()->getParent()->getConstantPool()->getConstants(); + for (auto &MO : MI->operands()) { + if (MO.isCPI()) { + // We have a Constant Pool Index operand in this instruction + // FIXME: should we deal with other types of operand like Immediate? + auto ConstantEntry = Constants[MO.getIndex()]; + // FIXME: what should we do with MachineConstantPoolEntry? + if (!ConstantEntry.isMachineConstantPoolEntry()) { + if (auto *C = dyn_cast(ConstantEntry.Val.ConstVal)) { + if (C->getType()->isVectorTy()) { + if (!(C = C->getSplatValue())) + return false; + } + if (auto *CFP = dyn_cast(C)) + return CFP->isExactlyValue(1.0); + } + } + } + } + } + return false; +} + +static EVT getFDivEVT(MachineInstr &Root) { + // FIXME: should we support other kinds of DIV? + switch (Root.getOpcode()) { + default: + break; + case X86::DIVSSrr: // f32 + case X86::VDIVSSrr: // f32 + return MVT::f32; + case X86::DIVPSrr: // v4f32 + case X86::VDIVPSrr: // v4f32 + return MVT::v4f32; + case X86::VDIVPSYrr: // v8f32 + return MVT::v8f32; + } + return MVT::INVALID_SIMPLE_VALUE_TYPE; +} + +/// genReciprocalDiv - Generates A = B * 1/C instead of A = B/C +/// (at the moment we support float types only: f32, v4f32 and 8f32) +/// TODO: Should we support double types for the latest CPUs? +/// +/// To get more precision we're using Newton-Raphson iterations like here: +/// +/// X[0] = reciprocal (C); +/// X[i+1] = X[i] + X[i] * (1 - C * X[i]); every iteration increases precision +/// +/// In theory if we know that X[0] is accurate to N bits, the result of +/// iteration k will be accurate to almost 2^k*N bits. For x86 it means: +/// X[0] = 11 bits +/// X[1] = 22 bits +/// x[2] = 44 bits +/// etc. 
+/// +/// And the result of division will be here: A = B * X +/// Example (-x86-asm-syntax=intel): instead of +/// +/// vmovss xmm1, dword ptr [rip + .LCPI0_0] # xmm1 = mem[0],zero,zero,zero +/// vdivss xmm0, xmm1, xmm0 +/// +/// we're generating +/// +/// vmovss xmm1, dword ptr [rip + .LCPI0_0] # xmm1 = mem[0],zero,zero,zero +/// vrcpss xmm2, xmm0, xmm0 +/// vmulss xmm0, xmm0, xmm2 +/// vsubss xmm0, xmm1, xmm0 +/// vmulss xmm0, xmm0, xmm2 +/// vaddss xmm0, xmm0, xmm2 + +#define FMA_INDEX 7 + +static void genReciprocalDiv(MachineInstr &Root, + SmallVectorImpl &InsInstrs, + DenseMap &InstrIdxForVirtReg, + ArrayRef Instrs, Type *Ty, + X86Subtarget &Subtarget) { + + MachineBasicBlock *MBB = Root.getParent(); + MachineFunction &MF = *MBB->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const TargetLowering *TLI = Subtarget.getTargetLowering(); + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); + const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); + int Iterations = TLI->getDivRefinementSteps(getFDivEVT(Root), MF); + + bool hasFMA = Subtarget.hasFMA(); + assert(!hasFMA || Instrs.size() > FMA_INDEX); + + unsigned ResultReg = Root.getOperand(0).getReg(); + unsigned DividendReg = Root.getOperand(1).getReg(); + bool DividendIsExactlyOne = isDividendExactlyOne(MF, DividendReg); + unsigned DividerReg = Root.getOperand(2).getReg(); + bool DividerIsKill = Root.getOperand(2).isKill(); + + if (TargetRegisterInfo::isVirtualRegister(ResultReg)) + MRI.constrainRegClass(ResultReg, RC); + if (TargetRegisterInfo::isVirtualRegister(DividendReg)) + MRI.constrainRegClass(DividendReg, RC); + if (TargetRegisterInfo::isVirtualRegister(DividerReg)) + MRI.constrainRegClass(DividerReg, RC); + + if (Iterations < 0) // all values >= 0 mean Iterations were defined explicitly + Iterations = 1; // otherwise we use the default value + + // The bullets below (0,2,1,3,4,5,6) mean the indexes inside
input Instrs + // For the meaning of the indexes (bullets) see genAlternativeCodeSequence below + // 0: rcp + // Initial estimate value is reciprocal division of C + MachineInstrBuilder RcpMI; + // Iff DivIsRcp == true Then div ~= rcp without any additional refinement + bool DivIsRcp = DividendIsExactlyOne && !Iterations; + + unsigned RcpVReg; + if (!DivIsRcp) { + // We need refinement and only because of that we need this vreg + RcpVReg = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(RcpVReg, 0)); + } + if (Instrs[0] == X86::VRCPSSr) + RcpMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]), + DivIsRcp ? ResultReg : RcpVReg) + .addReg(DividerReg, getKillRegState(DividerIsKill)) + .addReg(DividerReg, getKillRegState(DividerIsKill)); + else + RcpMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]), + DivIsRcp ? ResultReg : RcpVReg) + .addReg(DividerReg, getKillRegState(DividerIsKill)); + InsInstrs.push_back(RcpMI); + + unsigned LoadVReg = 0; + if (!DividendIsExactlyOne && Iterations) { + // 2: load (mov) + // We need all ones value to be able to do (1 - C * X[i]) + // x86-32 PIC requires a PIC base register for constant pools. + unsigned PICBase = 0; + if (MF.getTarget().isPositionIndependent()) { + if (Subtarget.is64Bit()) + PICBase = X86::RIP; + else + // FIXME: PICBase = getGlobalBaseReg(&MF); + // This doesn't work for several reasons. + // a. GlobalBaseReg may have been spilled. + // b. It may not be live at MI. + return; + } + // Create a constant-pool entry.
+ MachineConstantPool &MCP = *MF.getConstantPool(); + // const Constant *C = Constant::getAllOnesValue(Ty); + auto *CFP = ConstantFP::get(Ty, 1.0); + unsigned CPI = MCP.getConstantPoolIndex(CFP, 4); + LoadVReg = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(LoadVReg, 0)); + + MachineInstrBuilder LoadMI; + auto &MIDesc = TII->get(Instrs[2]); + if (MIDesc.getNumOperands() == 6) + LoadMI = BuildMI(MF, Root.getDebugLoc(), MIDesc, LoadVReg) + .addReg(PICBase) + .addImm(1) + .addReg(0) + .addConstantPoolIndex(CPI) + .addReg(0); + else + LoadMI = BuildMI(MF, Root.getDebugLoc(), MIDesc, LoadVReg) + .addConstantPoolIndex(CPI); + InsInstrs.push_back(LoadMI); + } + unsigned EstVReg = RcpVReg; // X[0] = reciprocal (C); + + for (int i = 0; i < Iterations; i++) { + if (hasFMA) { + // 7: fnmadd + unsigned NFmaVReg = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(NFmaVReg, 0)); + MachineInstrBuilder NFmaMI = + BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[FMA_INDEX]), NFmaVReg) + .addReg(DividerReg, getKillRegState(DividerIsKill)) + .addReg(DividendIsExactlyOne ? 
DividendReg : LoadVReg) + .addReg(EstVReg); + InsInstrs.push_back(NFmaMI); // 1 - C * X[i] + // 8: fmadd + MachineInstrBuilder FmaMI; + unsigned FmaVReg; + if (DividendIsExactlyOne && (i + 1 == Iterations)) + FmaVReg = ResultReg; + else { + FmaVReg = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(FmaVReg, 0)); + } + FmaMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[FMA_INDEX + 1]), + FmaVReg) + .addReg(EstVReg) + .addReg(EstVReg) + .addReg(NFmaVReg); + InsInstrs.push_back(FmaMI); // X[i] + X[i] * (1 - C * X[i]) + EstVReg = FmaVReg; + } else { + // 1: mul + unsigned MulVReg = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(MulVReg, 0)); + MachineInstrBuilder MulMI = + BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[1]), MulVReg) + .addReg(DividerReg, getKillRegState(DividerIsKill)) + .addReg(EstVReg); + InsInstrs.push_back(MulMI); // C * X[i] + + // 3: sub + unsigned SubVReg = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(SubVReg, 0)); + MachineInstrBuilder SubMI = + BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[3]), SubVReg) + .addReg(DividendIsExactlyOne ? 
DividendReg : LoadVReg) + .addReg(MulVReg); + InsInstrs.push_back(SubMI); // 1 - C * X[i] + + // 4: mul2 + unsigned Mul2VReg = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(Mul2VReg, 0)); + MachineInstrBuilder Mul2MI = + BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[4]), Mul2VReg) + .addReg(SubVReg) + .addReg(EstVReg); + InsInstrs.push_back(Mul2MI); // X[i] * (1 - C * X[i]) + + // 5: add + MachineInstrBuilder AddMI; + unsigned AddVReg; + if (DividendIsExactlyOne && (i + 1 == Iterations)) + AddVReg = ResultReg; + else { + AddVReg = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(AddVReg, 0)); + } + AddMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[5]), AddVReg) + .addReg(Mul2VReg) + .addReg(EstVReg); + InsInstrs.push_back(AddMI); // X[i] + X[i] * (1 - C * X[i]) + EstVReg = AddVReg; + } + } + if (!DividendIsExactlyOne) { + // 6: result mul + // The final multiplication B * 1/C + MachineInstrBuilder ResultMulMI = + BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[6]), ResultReg) + .addReg(DividendReg) + .addReg(EstVReg); + InsInstrs.push_back(ResultMulMI); + } + return; +} + +/// When getMachineCombinerPatterns() finds potential patterns, +/// this function generates the instructions that could replace the +/// original code sequence +void X86InstrInfo::genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern Pattern, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg) const { + MachineBasicBlock &MBB = *Root.getParent(); + MachineFunction &MF = *MBB.getParent(); + + switch (Pattern) { + default: + // Reassociate instructions. 
+ TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, + DelInstrs, InstrIdxForVirtReg); + return; + case MachineCombinerPattern::Div2RecipEst: + switch (Root.getOpcode()) { + default: + return; + case X86::VDIVSSrr: // f32 + genReciprocalDiv(Root, InsInstrs, InstrIdxForVirtReg, + {X86::VRCPSSr, X86::VMULSSrr, X86::VMOVSSrm, + X86::VSUBSSrr, X86::VMULSSrr, X86::VADDSSrr, + X86::VMULSSrr, X86::VFNMADD132SSr, + X86::VFMADD132SSr}, // FMA support at FMA_INDEX + Type::getFloatTy(MF.getFunction()->getContext()), + Subtarget); + break; + case X86::VDIVPSrr: // v4f32 + genReciprocalDiv( + Root, InsInstrs, InstrIdxForVirtReg, + {X86::VRCPPSr, X86::VMULPSrr, X86::VMOVAPSrm, X86::VSUBPSrr, + X86::VMULPSrr, X86::VADDPSrr, X86::VMULPSrr, X86::VFNMADD132PSr, + X86::VFMADD132PSr}, // FMA support at FMA_INDEX + VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 4), + Subtarget); + break; + case X86::VDIVPSYrr: // v8f32 + genReciprocalDiv( + Root, InsInstrs, InstrIdxForVirtReg, + {X86::VRCPPSYr, X86::VMULPSYrr, X86::VMOVAPSYrm, X86::VSUBPSYrr, + X86::VMULPSYrr, X86::VADDPSYrr, X86::VMULPSYrr, X86::VFNMADD132PSYr, + X86::VFMADD132PSYr}, // FMA support at FMA_INDEX + VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8), + Subtarget); + break; + case X86::DIVSSrr: // f32 + genReciprocalDiv(Root, InsInstrs, InstrIdxForVirtReg, + {X86::RCPSSr, X86::MULSSrr, X86::MOVSSrm, X86::SUBSSrr, + X86::MULSSrr, X86::ADDSSrr, X86::MULSSrr}, + Type::getFloatTy(MF.getFunction()->getContext()), + Subtarget); + break; + case X86::DIVPSrr: // v4f32 + genReciprocalDiv( + Root, InsInstrs, InstrIdxForVirtReg, + {X86::RCPPSr, X86::MULPSrr, X86::MOVAPSrr, X86::SUBPSrr, X86::MULPSrr, + X86::ADDPSrr, X86::MULPSrr}, + VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 4), + Subtarget); + break; + } + break; + } + DEBUG(dbgs() << "\nAlternate sequence for " << MF.getName() << "\n"; + for (unsigned i = 0; i < InsInstrs.size(); i++) { + dbgs() << i << 
": "; + InsInstrs[i]->print(dbgs(), false, MF.getSubtarget().getInstrInfo()); + }); + DelInstrs.push_back(&Root); // Record FDiv for deletion +} + +/// Find instructions that can be turned into recip +static bool getFDIVPatterns(MachineInstr &Root, + SmallVectorImpl &Patterns) { + switch (Root.getOpcode()) { + default: + return false; + // TODO: should we support other kinds of instructions? + case X86::VDIVSSrr: // f32 + case X86::VDIVPSrr: // v4f32 + case X86::VDIVPSYrr: // v8f32 + case X86::DIVSSrr: // f32 + case X86::DIVPSrr: // v4f32 + break; + } + auto *MF = Root.getParent()->getParent(); + auto TLI = MF->getSubtarget().getTargetLowering(); + EVT VT = getFDivEVT(Root); + if (VT == MVT::INVALID_SIMPLE_VALUE_TYPE) + return false; + switch (TLI->getRecipEstimateDivEnabled(VT, *MF, /*forDAG*/ false)) { + case TLI->ReciprocalEstimate::Disabled: + return false; + case TLI->ReciprocalEstimate::Unspecified: + if (Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath) + break; + return false; + } + + Patterns.push_back(MachineCombinerPattern::Div2RecipEst); + return true; +} + +/// Return true when there is potentially a faster code sequence for an +/// instruction chain ending in \p Root. All potential patterns are listed in +/// the \p Pattern vector. Pattern should be sorted in priority order since the +/// pattern evaluator stops checking as soon as it finds a faster sequence. + +bool X86InstrInfo::getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl &Patterns) const { + // FDIV patterns + if (getFDIVPatterns(Root, Patterns)) + return true; + // TODO: FSQRT patterns will be prepared after reciprocal implementation + // completes + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); +} + /// This is an architecture-specific helper function of reassociateOps. /// Set special operand attributes for new instructions after reassociation. 
void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, Index: test/CodeGen/X86/recip-fastmath.ll =================================================================== --- test/CodeGen/X86/recip-fastmath.ll +++ test/CodeGen/X86/recip-fastmath.ll @@ -37,9 +37,9 @@ define float @f32_one_step(float %x) #1 { ; SSE-LABEL: f32_one_step: ; SSE: # BB#0: +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: rcpss %xmm0, %xmm2 ; SSE-NEXT: mulss %xmm2, %xmm0 -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: subss %xmm0, %xmm1 ; SSE-NEXT: mulss %xmm2, %xmm1 ; SSE-NEXT: addss %xmm2, %xmm1 @@ -48,12 +48,12 @@ ; ; AVX-RECIP-LABEL: f32_one_step: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm2 +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_one_step: @@ -65,22 +65,18 @@ ; ; BTVER2-LABEL: f32_one_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm2 +; BTVER2-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; BTVER2-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; BTVER2-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; BTVER2-NEXT: retq ; ; 
SANDY-LABEL: f32_one_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_one_step: @@ -92,12 +88,8 @@ ; ; HASWELL-NO-FMA-LABEL: f32_one_step: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_one_step: @@ -113,10 +105,10 @@ define float @f32_two_step(float %x) #2 { ; SSE-LABEL: f32_two_step: ; SSE: # BB#0: +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: rcpss %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulss %xmm2, %xmm3 -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: subss %xmm3, %xmm4 ; SSE-NEXT: mulss %xmm2, %xmm4 @@ -130,80 +122,64 @@ ; ; AVX-RECIP-LABEL: f32_two_step: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: 
vaddss %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm2 +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; AVX-RECIP-NEXT: vsubss %xmm3, %xmm1, %xmm3 +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm3, %xmm3 +; AVX-RECIP-NEXT: vaddss %xmm2, %xmm3, %xmm2 +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_two_step: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 -; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 +; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm2 +; FMA-RECIP-NEXT: vmovaps %xmm0, %xmm3 +; FMA-RECIP-NEXT: vfnmadd132ss %xmm2, %xmm1, %xmm3 +; FMA-RECIP-NEXT: vfmadd132ss %xmm2, %xmm2, %xmm3 +; FMA-RECIP-NEXT: vfnmadd132ss %xmm3, %xmm1, %xmm0 ; FMA-RECIP-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: f32_two_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm2 +; BTVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; BTVER2-NEXT: vsubss %xmm3, %xmm1, %xmm3 +; BTVER2-NEXT: vmulss %xmm2, %xmm3, %xmm3 
+; BTVER2-NEXT: vaddss %xmm2, %xmm3, %xmm2 +; BTVER2-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; BTVER2-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; BTVER2-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_two_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_two_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 -; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 +; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm2 +; HASWELL-NEXT: vmovaps %xmm0, %xmm3 +; HASWELL-NEXT: vfnmadd132ss %xmm2, %xmm1, %xmm3 +; HASWELL-NEXT: vfmadd132ss %xmm2, %xmm2, %xmm3 +; HASWELL-NEXT: vfnmadd132ss %xmm3, %xmm1, %xmm0 ; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_two_step: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; 
HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_two_step: @@ -276,9 +252,9 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 { ; SSE-LABEL: v4f32_one_step: ; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: rcpps %xmm0, %xmm2 ; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: subps %xmm0, %xmm1 ; SSE-NEXT: mulps %xmm2, %xmm1 ; SSE-NEXT: addps %xmm2, %xmm1 @@ -287,12 +263,12 @@ ; ; AVX-RECIP-LABEL: v4f32_one_step: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm2 +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_one_step: @@ -304,40 +280,33 @@ ; ; BTVER2-LABEL: v4f32_one_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm2 +; BTVER2-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; BTVER2-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; BTVER2-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_one_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_one_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; HASWELL-NEXT: vrcpps %xmm0, %xmm2 +; HASWELL-NEXT: vfnmadd231ps %xmm2, %xmm0, %xmm1 +; HASWELL-NEXT: vfmadd132ps %xmm2, %xmm2, %xmm1 +; HASWELL-NEXT: vmovaps %xmm1, %xmm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v4f32_one_step: @@ -361,10 +330,10 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { ; SSE-LABEL: v4f32_two_step: ; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm1 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: rcpps %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: subps %xmm3, %xmm4 ; SSE-NEXT: mulps %xmm2, %xmm4 @@ -378,80 +347,64 @@ ; ; AVX-RECIP-LABEL: v4f32_two_step: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm2 +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; AVX-RECIP-NEXT: vsubps %xmm3, %xmm1, %xmm3 +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm3, %xmm3 +; AVX-RECIP-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_two_step: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm2 +; FMA-RECIP-NEXT: vmovaps 
%xmm0, %xmm3 +; FMA-RECIP-NEXT: vfnmadd132ps %xmm2, %xmm1, %xmm3 +; FMA-RECIP-NEXT: vfmadd132ps %xmm2, %xmm2, %xmm3 +; FMA-RECIP-NEXT: vfnmadd132ps %xmm3, %xmm1, %xmm0 ; FMA-RECIP-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v4f32_two_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm2 +; BTVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; BTVER2-NEXT: vsubps %xmm3, %xmm1, %xmm3 +; BTVER2-NEXT: vmulps %xmm2, %xmm3, %xmm3 +; BTVER2-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; BTVER2-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; BTVER2-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; BTVER2-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_two_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_two_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: 
vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; HASWELL-NEXT: vrcpps %xmm0, %xmm2 +; HASWELL-NEXT: vmovaps %xmm0, %xmm3 +; HASWELL-NEXT: vfnmadd132ps %xmm2, %xmm1, %xmm3 +; HASWELL-NEXT: vfmadd132ps %xmm2, %xmm2, %xmm3 +; HASWELL-NEXT: vfnmadd132ps %xmm3, %xmm1, %xmm0 ; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_two_step: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 -; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v4f32_two_step: @@ -538,9 +491,9 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 { ; SSE-LABEL: v8f32_one_step: ; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: rcpps %xmm0, %xmm4 ; SSE-NEXT: mulps %xmm4, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: subps %xmm0, %xmm3 ; SSE-NEXT: mulps %xmm4, %xmm3 @@ -556,12 +509,12 @@ ; ; AVX-RECIP-LABEL: v8f32_one_step: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps 
{{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX-RECIP-NEXT: vsubps %ymm0, %ymm1, %ymm0 +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX-RECIP-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_one_step: @@ -573,40 +526,33 @@ ; ; BTVER2-LABEL: v8f32_one_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm2 +; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; BTVER2-NEXT: vsubps %ymm0, %ymm1, %ymm0 +; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; BTVER2-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_one_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_one_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 +; HASWELL-NEXT: vrcpps %ymm0, %ymm2 +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 +; HASWELL-NEXT: vfnmadd231ps %ymm2, %ymm0, %ymm1 +; HASWELL-NEXT: vfmadd132ps %ymm2, %ymm2, %ymm1 +; HASWELL-NEXT: vmovaps %ymm1, %ymm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 +; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v8f32_one_step: @@ -631,10 +577,10 @@ ; SSE-LABEL: v8f32_two_step: ; SSE: # BB#0: ; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: rcpps %xmm0, %xmm3 ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: mulps %xmm3, %xmm4 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: subps %xmm4, %xmm5 ; SSE-NEXT: mulps %xmm3, %xmm5 @@ -660,80 +606,64 @@ ; ; AVX-RECIP-LABEL: v8f32_two_step: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps 
%ymm2, %ymm3, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 +; AVX-RECIP-NEXT: vsubps %ymm3, %ymm1, %ymm3 +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm3, %ymm3 +; AVX-RECIP-NEXT: vaddps %ymm2, %ymm3, %ymm2 +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX-RECIP-NEXT: vsubps %ymm0, %ymm1, %ymm0 +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX-RECIP-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_two_step: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3 -; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 +; FMA-RECIP-NEXT: vmovaps %ymm0, %ymm3 +; FMA-RECIP-NEXT: vfnmadd132ps %ymm2, %ymm1, %ymm3 +; FMA-RECIP-NEXT: vfmadd132ps %ymm2, %ymm2, %ymm3 +; FMA-RECIP-NEXT: vfnmadd132ps %ymm3, %ymm1, %ymm0 ; FMA-RECIP-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_two_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, 
%ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm2 +; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3 +; BTVER2-NEXT: vsubps %ymm3, %ymm1, %ymm3 +; BTVER2-NEXT: vmulps %ymm2, %ymm3, %ymm3 +; BTVER2-NEXT: vaddps %ymm2, %ymm3, %ymm2 +; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; BTVER2-NEXT: vsubps %ymm0, %ymm1, %ymm0 +; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; BTVER2-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_two_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_two_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NEXT: vmovaps %ymm1, %ymm3 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 +; 
HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 +; HASWELL-NEXT: vrcpps %ymm0, %ymm2 +; HASWELL-NEXT: vmovaps %ymm0, %ymm3 +; HASWELL-NEXT: vfnmadd132ps %ymm2, %ymm1, %ymm3 +; HASWELL-NEXT: vfmadd132ps %ymm2, %ymm2, %ymm3 +; HASWELL-NEXT: vfnmadd132ps %ymm3, %ymm1, %ymm0 ; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_two_step: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 -; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 +; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v8f32_two_step: Index: test/CodeGen/X86/recip-fastmath2.ll =================================================================== --- test/CodeGen/X86/recip-fastmath2.ll +++ test/CodeGen/X86/recip-fastmath2.ll @@ -38,8 +38,8 @@ ; ; SANDY-LABEL: f32_no_step_2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_no_step_2: @@ -67,8 +67,8 @@ ; SSE-LABEL: f32_one_step_2: ; SSE: # BB#0: ; SSE-NEXT: rcpss %xmm0, %xmm2 -; SSE-NEXT: mulss %xmm2, %xmm0 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: mulss %xmm2, %xmm0 ; SSE-NEXT: subss %xmm0, %xmm1 ; SSE-NEXT: mulss %xmm2, %xmm1 ; SSE-NEXT: addss %xmm2, %xmm1 @@ -79,20 +79,21 @@ ; AVX-RECIP-LABEL: f32_one_step_2: ; AVX-RECIP: # BB#0: ; AVX-RECIP-NEXT: vrcpss 
%xmm0, %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_one_step_2: ; FMA-RECIP: # BB#0: ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; FMA-RECIP-NEXT: vfnmadd231ss %xmm1, %xmm0, %xmm2 +; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm2 +; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm0 ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: f32_one_step_2: @@ -101,39 +102,27 @@ ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_one_step_2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_one_step_2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 
-; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_one_step_2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_one_step_2: @@ -150,25 +139,25 @@ define float @f32_one_step_2_divs(float %x) #1 { ; SSE-LABEL: f32_one_step_2_divs: ; SSE: # BB#0: -; SSE-NEXT: rcpss %xmm0, %xmm1 -; SSE-NEXT: mulss %xmm1, %xmm0 -; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-NEXT: subss %xmm0, %xmm2 -; SSE-NEXT: mulss %xmm1, %xmm2 -; SSE-NEXT: addss %xmm1, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: mulss %xmm2, %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: rcpss %xmm0, %xmm2 ; SSE-NEXT: mulss %xmm2, %xmm0 +; SSE-NEXT: subss %xmm0, %xmm1 +; SSE-NEXT: mulss %xmm2, %xmm1 +; SSE-NEXT: addss %xmm2, %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: mulss %xmm1, %xmm0 +; SSE-NEXT: mulss %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_one_step_2_divs: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; AVX-RECIP-NEXT: vmulss %xmm0, 
%xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm2 +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: retq @@ -184,24 +173,20 @@ ; ; BTVER2-LABEL: f32_one_step_2_divs: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm2 +; BTVER2-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; BTVER2-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; BTVER2-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: retq @@ -217,12 +202,8 @@ ; ; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = 
mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: retq @@ -244,9 +225,9 @@ ; SSE-LABEL: f32_two_step_2: ; SSE: # BB#0: ; SSE-NEXT: rcpss %xmm0, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulss %xmm2, %xmm3 -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: subss %xmm3, %xmm4 ; SSE-NEXT: mulss %xmm2, %xmm4 @@ -262,15 +243,15 @@ ; AVX-RECIP-LABEL: f32_two_step_2: ; AVX-RECIP: # BB#0: ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm3 +; AVX-RECIP-NEXT: vsubss %xmm3, %xmm2, %xmm3 +; AVX-RECIP-NEXT: vmulss %xmm1, %xmm3, %xmm3 +; AVX-RECIP-NEXT: vaddss %xmm1, %xmm3, %xmm1 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 +; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 ; AVX-RECIP-NEXT: retq ; @@ -278,69 +259,45 @@ ; FMA-RECIP: # BB#0: ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 ; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; FMA-RECIP-NEXT: vmovaps %xmm1, 
%xmm3 -; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 +; FMA-RECIP-NEXT: vmovaps %xmm0, %xmm3 +; FMA-RECIP-NEXT: vfnmadd132ss %xmm1, %xmm2, %xmm3 ; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 +; FMA-RECIP-NEXT: vfnmadd132ss %xmm3, %xmm2, %xmm0 ; FMA-RECIP-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 ; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: f32_two_step_2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm3 +; BTVER2-NEXT: vsubss %xmm3, %xmm2, %xmm3 +; BTVER2-NEXT: vmulss %xmm1, %xmm3, %xmm3 +; BTVER2-NEXT: vaddss %xmm1, %xmm3, %xmm1 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_two_step_2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; 
SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_two_step_2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 -; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 -; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_two_step_2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_two_step_2: @@ -362,8 +319,8 @@ ; SSE-LABEL: v4f32_one_step2: ; SSE: # BB#0: ; SSE-NEXT: rcpps %xmm0, %xmm2 +; SSE-NEXT: movaps ${{\.LCPI.*}}, %xmm1 ; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: subps %xmm0, %xmm1 ; SSE-NEXT: mulps %xmm2, %xmm1 ; SSE-NEXT: addps %xmm2, %xmm1 @@ -374,20 +331,21 @@ ; AVX-RECIP-LABEL: v4f32_one_step2: ; AVX-RECIP: # BB#0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_one_step2: ; FMA-RECIP: # BB#0: ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0 -; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; FMA-RECIP-NEXT: vfnmadd231ps %xmm1, %xmm0, %xmm2 +; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm2 +; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm0 ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v4f32_one_step2: @@ -396,40 +354,27 @@ ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_one_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_one_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v4f32_one_step2: @@ -455,25 +400,25 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 { ; SSE-LABEL: v4f32_one_step_2_divs: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm1, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SSE-NEXT: subps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: addps %xmm1, %xmm2 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] -; SSE-NEXT: mulps %xmm2, %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: rcpps %xmm0, %xmm2 ; SSE-NEXT: mulps %xmm2, %xmm0 +; SSE-NEXT: subps %xmm0, %xmm1 +; SSE-NEXT: mulps %xmm2, %xmm1 +; SSE-NEXT: addps %xmm2, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SSE-NEXT: mulps %xmm1, %xmm0 +; SSE-NEXT: mulps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v4f32_one_step_2_divs: ; AVX-RECIP: # BB#0: 
-; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm2 +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: retq @@ -489,46 +434,38 @@ ; ; BTVER2-LABEL: v4f32_one_step_2_divs: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm2 +; BTVER2-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; BTVER2-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; BTVER2-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_one_step_2_divs: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; HASWELL-NEXT: vrcpps %xmm0, %xmm2 +; HASWELL-NEXT: vfnmadd231ps %xmm2, %xmm0, %xmm1 +; HASWELL-NEXT: vfmadd132ps %xmm2, %xmm2, %xmm1 +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm1, %xmm0 +; HASWELL-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: retq @@ -560,9 +497,9 @@ ; SSE-LABEL: v4f32_two_step2: ; SSE: # BB#0: ; SSE-NEXT: rcpps %xmm0, %xmm2 +; SSE-NEXT: movaps ${{\.LCPI.*}}, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: subps %xmm3, %xmm4 ; SSE-NEXT: mulps %xmm2, %xmm4 @@ -578,15 +515,15 @@ ; AVX-RECIP-LABEL: v4f32_two_step2: ; AVX-RECIP: # BB#0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulps 
%xmm1, %xmm0, %xmm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm3 +; AVX-RECIP-NEXT: vsubps %xmm3, %xmm2, %xmm3 +; AVX-RECIP-NEXT: vmulps %xmm1, %xmm3, %xmm3 +; AVX-RECIP-NEXT: vaddps %xmm1, %xmm3, %xmm1 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 ; AVX-RECIP-NEXT: retq ; @@ -594,69 +531,45 @@ ; FMA-RECIP: # BB#0: ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 +; FMA-RECIP-NEXT: vmovaps %xmm0, %xmm3 +; FMA-RECIP-NEXT: vfnmadd132ps %xmm1, %xmm2, %xmm3 ; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 +; FMA-RECIP-NEXT: vfnmadd132ps %xmm3, %xmm2, %xmm0 ; FMA-RECIP-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v4f32_two_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; BTVER2-NEXT: 
vaddps %xmm2, %xmm1, %xmm1 +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm3 +; BTVER2-NEXT: vsubps %xmm3, %xmm2, %xmm3 +; BTVER2-NEXT: vmulps %xmm1, %xmm3, %xmm3 +; BTVER2-NEXT: vaddps %xmm1, %xmm3, %xmm1 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_two_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_two_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 -; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_two_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; 
HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 -; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v4f32_two_step2: @@ -689,41 +602,42 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { ; SSE-LABEL: v8f32_one_step2: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm1, %xmm4 -; SSE-NEXT: mulps %xmm4, %xmm1 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: rcpps %xmm0, %xmm4 +; SSE-NEXT: movaps ${{\.LCPI.*}}, %xmm2 +; SSE-NEXT: mulps %xmm4, %xmm0 ; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: subps %xmm1, %xmm3 +; SSE-NEXT: subps %xmm0, %xmm3 ; SSE-NEXT: mulps %xmm4, %xmm3 ; SSE-NEXT: addps %xmm4, %xmm3 -; SSE-NEXT: rcpps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm1, %xmm0 -; SSE-NEXT: subps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: addps %xmm1, %xmm2 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: rcpps %xmm1, %xmm0 +; SSE-NEXT: mulps %xmm0, %xmm1 +; SSE-NEXT: subps %xmm1, %xmm2 +; SSE-NEXT: mulps %xmm0, %xmm2 +; SSE-NEXT: addps %xmm0, %xmm2 +; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_one_step2: ; AVX-RECIP: # BB#0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, 
%ymm0 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; AVX-RECIP-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_one_step2: ; FMA-RECIP: # BB#0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0 -; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; FMA-RECIP-NEXT: vfnmadd231ps %ymm1, %ymm0, %ymm2 +; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm2 +; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm2, %ymm0 ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_one_step2: @@ -732,40 +646,27 @@ ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_one_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = 
[1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_one_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v8f32_one_step2: @@ -791,34 +692,34 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 { ; SSE-LABEL: v8f32_one_step_2_divs: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: subps %xmm0, %xmm4 -; SSE-NEXT: mulps %xmm2, %xmm4 -; SSE-NEXT: addps %xmm2, %xmm4 -; SSE-NEXT: rcpps %xmm1, %xmm0 -; SSE-NEXT: mulps %xmm0, %xmm1 -; SSE-NEXT: subps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm0, %xmm3 -; SSE-NEXT: addps %xmm0, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm1 = 
[5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: rcpps %xmm1, %xmm3 ; SSE-NEXT: mulps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: subps %xmm1, %xmm4 +; SSE-NEXT: mulps %xmm3, %xmm4 +; SSE-NEXT: addps %xmm3, %xmm4 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; SSE-NEXT: mulps %xmm4, %xmm1 +; SSE-NEXT: rcpps %xmm0, %xmm3 +; SSE-NEXT: mulps %xmm3, %xmm0 +; SSE-NEXT: subps %xmm0, %xmm2 +; SSE-NEXT: mulps %xmm3, %xmm2 +; SSE-NEXT: addps %xmm3, %xmm2 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] -; SSE-NEXT: mulps %xmm4, %xmm0 -; SSE-NEXT: mulps %xmm4, %xmm0 -; SSE-NEXT: mulps %xmm3, %xmm1 +; SSE-NEXT: mulps %xmm2, %xmm0 +; SSE-NEXT: mulps %xmm2, %xmm0 +; SSE-NEXT: mulps %xmm4, %xmm1 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_one_step_2_divs: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX-RECIP-NEXT: vsubps %ymm0, %ymm1, %ymm0 +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX-RECIP-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; AVX-RECIP-NEXT: retq @@ -834,46 +735,38 @@ ; ; BTVER2-LABEL: v8f32_one_step_2_divs: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm2 +; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; BTVER2-NEXT: vsubps %ymm0, %ymm1, %ymm0 +; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; BTVER2-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_one_step_2_divs: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 +; HASWELL-NEXT: vrcpps %ymm0, %ymm2 +; HASWELL-NEXT: vfnmadd231ps %ymm2, %ymm0, %ymm1 +; HASWELL-NEXT: vfmadd132ps %ymm2, 
%ymm2, %ymm1 +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm0 +; HASWELL-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 +; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; HASWELL-NO-FMA-NEXT: retq @@ -904,48 +797,48 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; SSE-LABEL: v8f32_two_step2: ; SSE: # BB#0: -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: rcpps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: rcpps %xmm0, %xmm3 +; SSE-NEXT: movaps ${{\.LCPI.*}}, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: mulps %xmm3, %xmm4 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: subps %xmm4, %xmm5 ; SSE-NEXT: mulps %xmm3, %xmm5 ; SSE-NEXT: addps %xmm3, %xmm5 -; SSE-NEXT: mulps %xmm5, %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subps %xmm1, %xmm3 +; SSE-NEXT: mulps %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: subps %xmm0, %xmm3 ; SSE-NEXT: mulps %xmm5, %xmm3 ; SSE-NEXT: addps %xmm5, %xmm3 -; SSE-NEXT: rcpps %xmm2, %xmm1 +; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 +; SSE-NEXT: rcpps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: mulps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: mulps %xmm0, %xmm4 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: subps %xmm4, %xmm5 -; SSE-NEXT: mulps %xmm1, %xmm5 -; SSE-NEXT: addps 
%xmm1, %xmm5 +; SSE-NEXT: mulps %xmm0, %xmm5 +; SSE-NEXT: addps %xmm0, %xmm5 ; SSE-NEXT: mulps %xmm5, %xmm2 -; SSE-NEXT: subps %xmm2, %xmm0 -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: addps %xmm5, %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: subps %xmm2, %xmm1 +; SSE-NEXT: mulps %xmm5, %xmm1 +; SSE-NEXT: addps %xmm5, %xmm1 +; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_two_step2: ; AVX-RECIP: # BB#0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm3 +; AVX-RECIP-NEXT: vsubps %ymm3, %ymm2, %ymm3 +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm3, %ymm3 +; AVX-RECIP-NEXT: vaddps %ymm1, %ymm3, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; AVX-RECIP-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-RECIP-NEXT: retq ; @@ -953,69 +846,45 @@ ; FMA-RECIP: # BB#0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3 -; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 +; 
FMA-RECIP-NEXT: vmovaps %ymm0, %ymm3 +; FMA-RECIP-NEXT: vfnmadd132ps %ymm1, %ymm2, %ymm3 ; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 +; FMA-RECIP-NEXT: vfnmadd132ps %ymm3, %ymm2, %ymm0 ; FMA-RECIP-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_two_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm3 +; BTVER2-NEXT: vsubps %ymm3, %ymm2, %ymm3 +; BTVER2-NEXT: vmulps %ymm1, %ymm3, %ymm3 +; BTVER2-NEXT: vaddps %ymm1, %ymm3, %ymm1 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_two_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; 
SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_two_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NEXT: vmovaps %ymm1, %ymm3 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 -; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_two_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 -; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v8f32_two_step2: @@ -1098,9 +967,9 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 { ; SSE-LABEL: v8f32_no_step2: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm1, %xmm1 ; 
SSE-NEXT: rcpps %xmm0, %xmm0 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 +; SSE-NEXT: rcpps %xmm1, %xmm1 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 ; SSE-NEXT: retq ; @@ -1124,20 +993,20 @@ ; ; SANDY-LABEL: v8f32_no_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_no_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_no_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v8f32_no_step2: