Index: include/llvm/CodeGen/MachineCombinerPattern.h
===================================================================
--- include/llvm/CodeGen/MachineCombinerPattern.h
+++ include/llvm/CodeGen/MachineCombinerPattern.h
@@ -71,7 +71,10 @@
   FMLSv2f32_OP2,
   FMLSv2f64_OP2,
   FMLSv4i32_indexed_OP2,
-  FMLSv4f32_OP2
+  FMLSv4f32_OP2,
+
+  // This is the FDIV-RECIP pattern matched by the X86 machine combiner
+  Div2RecipEst
 };
 
 } // end namespace llvm
Index: lib/CodeGen/MachineCombiner.cpp
===================================================================
--- lib/CodeGen/MachineCombiner.cpp
+++ lib/CodeGen/MachineCombiner.cpp
@@ -153,9 +153,16 @@
       assert(DefInstr &&
              "There must be a definition for a new virtual register");
       DepthOp = InstrDepth[II->second];
-      LatencyOp = TSchedModel.computeOperandLatency(
-          DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
-          InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
+      int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
+      int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
+      assert((DefIdx >= 0 || UseIdx >= 0) && "Invalid reg usage");
+      if (DefIdx < 0 || UseIdx < 0)
+        // Without both def and use indexes we can't compute the latency from
+        // the sched model, that's why we're forced to use the default value
+        LatencyOp = TII->defaultDefLatency(SchedModel, *DefInstr);
+      else
+        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
+                                                      InstrPtr, UseIdx);
     } else {
       MachineInstr *DefInstr = getOperandDef(MO);
       if (DefInstr) {
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15031,6 +15031,18 @@
 ///   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
 ///     does not require additional intermediate precision]
 SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) {
+
+  // This effort should be moved lower on Machine Combiner level because only
+  // there we know real estimation of fdiv and its reciprocal implementation
+  switch (DAG.getTarget().getTargetTriple().getArch()) {
+  default:
+    break;
+  // These architectures deal with reciprocal estimate on Machine Combiner level
+  case Triple::x86:
+  case Triple::x86_64:
+    return SDValue();
+  }
+
   if (Level >= AfterLegalizeDAG)
     return SDValue();
 
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -504,6 +504,21 @@
     return true;
   }
 
+  /// When getMachineCombinerPatterns() finds patterns, this function generates
+  /// the instructions that could replace the original code sequence
+  void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern Pattern,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+
+  /// Return true when there is potentially a faster code sequence
+  /// for an instruction chain ending in <Root>. All potential patterns are
+  /// listed in the <Patterns> array.
+  bool getMachineCombinerPatterns(
+      MachineInstr &Root,
+      SmallVectorImpl<MachineCombinerPattern> &Patterns) const override;
+
   bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
 
   bool hasReassociableOperands(const MachineInstr &Inst,
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -9448,6 +9448,356 @@
   }
 }
 
+static bool isDividendExactlyOne(MachineFunction &MF, unsigned DividendReg) {
+  // The dividend could be ExactlyOne value and in this case we should not
+  // create an additional constant for the reciprocal division but use the
+  // dividend instead.
+  // Find the dividend definition. Only a plain load of a constant ExactlyOne
+  // counts: an arithmetic def with a folded 1.0 operand does not produce 1.0.
+  MachineInstr *MI = MF.getRegInfo().getUniqueVRegDef(DividendReg);
+  if (MI && MI->canFoldAsLoad()) {
+    const auto &Constants = MF.getConstantPool()->getConstants();
+    for (auto &MO : MI->operands()) {
+      if (MO.isCPI()) {
+        // We have a Constant Pool Index operand in this instruction
+        // FIXME: should we deal with other types of operand like Immediate?
+        const auto &ConstantEntry = Constants[MO.getIndex()];
+        // FIXME: what should we do with MachineConstantPoolEntry?
+        if (!ConstantEntry.isMachineConstantPoolEntry()) {
+          if (auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal)) {
+            if (C->getType()->isVectorTy()) {
+              if (!(C = C->getSplatValue()))
+                return false;
+            }
+            if (auto *CFP = dyn_cast<ConstantFP>(C))
+              return CFP->isExactlyValue(1.0);
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
+static EVT getFDivEVT(MachineInstr &Root) {
+  // FIXME: should we support other kinds of DIV?
+  switch (Root.getOpcode()) {
+  default:
+    break;
+  case X86::DIVSSrr:  // f32
+  case X86::VDIVSSrr: // f32
+    return MVT::f32;
+  case X86::DIVPSrr:  // v4f32
+  case X86::VDIVPSrr: // v4f32
+    return MVT::v4f32;
+  case X86::VDIVPSYrr: // v8f32
+    return MVT::v8f32;
+  }
+  return MVT::INVALID_SIMPLE_VALUE_TYPE;
+}
+
+/// genReciprocalDiv - Generates A = B * 1/C instead of A = B/C
+/// (at the moment we support float types only: f32, v4f32 and v8f32)
+/// TODO: Should we support double types for the latest CPUs?
+///
+/// To get more precision we're using Newton-Raphson iterations like here:
+///
+///    X[0] = reciprocal (C);
+///    X[i+1] = X[i] + X[i] * (1 - C * X[i]); every iteration increases precision
+///
+/// In theory if we know that X[0] is accurate to N bits, the result of
+/// iteration k will be accurate to almost 2^k*N bits. For x86 it means:
+///    X[0] = 11 bits
+///    X[1] = 22 bits
+///    X[2] = 44 bits
+///    etc.
+///
+/// And the result of division will be here: A = B * X
+/// Example (-x86-asm-syntax=intel): instead of
+///
+///    vmovss  xmm1, dword ptr [rip + .LCPI0_0] # xmm1 = mem[0],zero,zero,zero
+///    vdivss  xmm0, xmm1, xmm0
+///
+/// we're generating
+///
+///    vmovss  xmm1, dword ptr [rip + .LCPI0_0] # xmm1 = mem[0],zero,zero,zero
+///    vrcpss  xmm2, xmm0, xmm0
+///    vmulss  xmm0, xmm0, xmm2
+///    vsubss  xmm0, xmm1, xmm0
+///    vmulss  xmm0, xmm0, xmm2
+///    vaddss  xmm0, xmm0, xmm2
+/// \p Instrs holds the opcodes in slot order {rcp, mul, load, sub, mul, add, mul}.
+static void genReciprocalDiv(MachineInstr &Root,
+                             SmallVectorImpl<MachineInstr *> &InsInstrs,
+                             DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                             ArrayRef<unsigned> Instrs, Type *Ty,
+                             X86Subtarget &Subtarget) {
+
+  MachineBasicBlock *MBB = Root.getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+  int Iterations = TLI->getDivRefinementSteps(getFDivEVT(Root), MF);
+
+  unsigned ResultReg = Root.getOperand(0).getReg();
+
+  unsigned DividendReg = Root.getOperand(1).getReg();
+  bool DividendIsExactlyOne = isDividendExactlyOne(MF, DividendReg);
+
+  unsigned DividerReg = Root.getOperand(2).getReg();
+  bool DividerIsKill = Root.getOperand(2).isKill();
+  if (TargetRegisterInfo::isVirtualRegister(ResultReg))
+    MRI.constrainRegClass(ResultReg, RC);
+  if (TargetRegisterInfo::isVirtualRegister(DividendReg))
+    MRI.constrainRegClass(DividendReg, RC);
+  if (TargetRegisterInfo::isVirtualRegister(DividerReg))
+    MRI.constrainRegClass(DividerReg, RC);
+
+  // Negative means it was not set via option or attribute: use the default.
+  if (Iterations < 0)
+    Iterations = 1;
+
+  // The bullets below (0,2,1,3,4,5,6) mean the indexes inside input Instrs
+  // 0: rcp
+  // Initial estimate value is reciprocal
division of C
+  MachineInstrBuilder RcpMI;
+  // Iff DivIsRcp == true then div ~= rcp without any additional refinement
+  bool DivIsRcp = DividendIsExactlyOne && !Iterations;
+  // Refinement re-reads the divider, so rcp may kill it only without steps.
+  unsigned RcpKill = getKillRegState(DividerIsKill && !Iterations);
+  unsigned RcpVReg = 0; // Stays unused when rcp defines ResultReg directly.
+  if (!DivIsRcp) {
+    RcpVReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg.insert(std::make_pair(RcpVReg, 0));
+  }
+  if (Instrs[0] == X86::VRCPSSr)
+    RcpMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]),
+                    DivIsRcp ? ResultReg : RcpVReg)
+                .addReg(DividerReg, RcpKill)
+                .addReg(DividerReg, RcpKill);
+  else
+    RcpMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]),
+                    DivIsRcp ? ResultReg : RcpVReg)
+                .addReg(DividerReg, RcpKill);
+  InsInstrs.push_back(RcpMI);
+
+  unsigned LoadVReg = 0;
+  if (!DividendIsExactlyOne && Iterations) {
+    // 2: load of the 1.0 constant needed to compute (1 - C * X[i])
+    // x86-32 PIC requires a PIC base register for constant pools.
+    unsigned PICBase = 0;
+    if (MF.getTarget().isPositionIndependent()) {
+      if (Subtarget.is64Bit())
+        PICBase = X86::RIP;
+      else {
+        // FIXME: PICBase = getGlobalBaseReg(&MF) doesn't work: GlobalBaseReg
+        // may have been spilled and may not be live at MI. Keep the fdiv.
+        MF.DeleteMachineInstr(RcpMI);
+        InsInstrs.clear(); // Leave no half-built sequence behind.
+        return;
+      }
+    }
+    // Create a constant-pool entry.
+    MachineConstantPool &MCP = *MF.getConstantPool();
+    auto *CFP = ConstantFP::get(Ty, 1.0);
+    // The aligned packed loads (movaps) need the entry aligned for its type.
+    unsigned Alignment = MF.getDataLayout().getPrefTypeAlignment(Ty);
+    unsigned CPI = MCP.getConstantPoolIndex(CFP, Alignment);
+    LoadVReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg[LoadVReg] = InsInstrs.size(); // Index of LoadMI
+    MachineInstrBuilder LoadMI;
+    auto &MIDesc = TII->get(Instrs[2]);
+    if (MIDesc.getNumOperands() == 6)
+      LoadMI = BuildMI(MF, Root.getDebugLoc(), MIDesc, LoadVReg)
+                   .addReg(PICBase)
+                   .addImm(1)
+                   .addReg(0)
+                   .addConstantPoolIndex(CPI)
+                   .addReg(0);
+    else
+      LoadMI = BuildMI(MF, Root.getDebugLoc(), MIDesc, LoadVReg)
+                   .addConstantPoolIndex(CPI);
+    InsInstrs.push_back(LoadMI);
+  }
+  unsigned EstVReg = RcpVReg; // X[0] = reciprocal (C);
+
+  for (int i = 0; i < Iterations; i++) {
+    // 1: mul (only the last refinement step may kill the divider)
+    unsigned MulVReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg[MulVReg] = InsInstrs.size(); // Index of MulMI
+    auto MulMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[1]), MulVReg)
+                     .addReg(DividerReg, getKillRegState(DividerIsKill &&
+                                                         i + 1 == Iterations))
+                     .addReg(EstVReg);
+    InsInstrs.push_back(MulMI); // C * X[i]
+
+    // 3: sub
+    unsigned SubVReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg[SubVReg] = InsInstrs.size(); // Index of SubMI
+    MachineInstrBuilder SubMI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[3]), SubVReg)
+            .addReg(DividendIsExactlyOne ?
DividendReg : LoadVReg)
+            .addReg(MulVReg);
+    InsInstrs.push_back(SubMI); // 1 - C * X[i]
+
+    // 4: mul2
+    unsigned Mul2VReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg[Mul2VReg] = InsInstrs.size(); // Index of Mul2MI
+    MachineInstrBuilder Mul2MI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[4]), Mul2VReg)
+            .addReg(SubVReg)
+            .addReg(EstVReg);
+    InsInstrs.push_back(Mul2MI); // X[i] * (1 - C * X[i])
+
+    // 5: add
+    MachineInstrBuilder AddMI;
+    if (DividendIsExactlyOne && (i + 1 == Iterations))
+      AddMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[5]), ResultReg)
+                  .addReg(Mul2VReg)
+                  .addReg(EstVReg);
+    else {
+      unsigned AddVReg = MRI.createVirtualRegister(RC);
+      InstrIdxForVirtReg[AddVReg] = InsInstrs.size(); // Index of AddMI
+      AddMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[5]), AddVReg)
+                  .addReg(Mul2VReg)
+                  .addReg(EstVReg);
+      EstVReg = AddVReg;
+    }
+    InsInstrs.push_back(AddMI); // X[i] + X[i] * (1 - C * X[i])
+  }
+  if (!DividendIsExactlyOne) {
+    // 6: result mul
+    // The final multiplication B * 1/C
+    MachineInstrBuilder ResultMulMI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[6]), ResultReg)
+            .addReg(DividendReg)
+            .addReg(EstVReg);
+    InsInstrs.push_back(ResultMulMI);
+  }
+  return;
+}
+
+/// When getMachineCombinerPatterns() finds potential patterns,
+/// this function generates the instructions that could replace the
+/// original code sequence
+void X86InstrInfo::genAlternativeCodeSequence(
+    MachineInstr &Root, MachineCombinerPattern Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+  MachineBasicBlock &MBB = *Root.getParent();
+  MachineFunction &MF = *MBB.getParent();
+
+  switch (Pattern) {
+  default:
+    // Reassociate instructions.
+    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
+                                                DelInstrs, InstrIdxForVirtReg);
+    return;
+  case MachineCombinerPattern::Div2RecipEst:
+    switch (Root.getOpcode()) {
+    default:
+      return;
+    case X86::VDIVSSrr: // f32
+      genReciprocalDiv(
+          Root, InsInstrs, InstrIdxForVirtReg,
+          {X86::VRCPSSr, X86::VMULSSrr, X86::VMOVSSrm, X86::VSUBSSrr,
+           X86::VMULSSrr, X86::VADDSSrr, X86::VMULSSrr},
+          Type::getFloatTy(MF.getFunction()->getContext()), Subtarget);
+      break;
+    case X86::VDIVPSrr: // v4f32
+      genReciprocalDiv(
+          Root, InsInstrs, InstrIdxForVirtReg,
+          {X86::VRCPPSr, X86::VMULPSrr, X86::VMOVAPSrm, X86::VSUBPSrr,
+           X86::VMULPSrr, X86::VADDPSrr, X86::VMULPSrr},
+          VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 4),
+          Subtarget);
+      break;
+    case X86::VDIVPSYrr: // v8f32
+      genReciprocalDiv(
+          Root, InsInstrs, InstrIdxForVirtReg,
+          {X86::VRCPPSYr, X86::VMULPSYrr, X86::VMOVAPSYrm, X86::VSUBPSYrr,
+           X86::VMULPSYrr, X86::VADDPSYrr, X86::VMULPSYrr},
+          VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8),
+          Subtarget);
+      break;
+    case X86::DIVSSrr: // f32
+      genReciprocalDiv(Root, InsInstrs, InstrIdxForVirtReg,
+                       {X86::RCPSSr, X86::MULSSrr, X86::MOVSSrm, X86::SUBSSrr,
+                        X86::MULSSrr, X86::ADDSSrr, X86::MULSSrr},
+                       Type::getFloatTy(MF.getFunction()->getContext()),
+                       Subtarget);
+      break;
+    case X86::DIVPSrr: // v4f32 (slot 2 must be the load form, MOVAPSrm)
+      genReciprocalDiv(
+          Root, InsInstrs, InstrIdxForVirtReg,
+          {X86::RCPPSr, X86::MULPSrr, X86::MOVAPSrm, X86::SUBPSrr, X86::MULPSrr,
+           X86::ADDPSrr, X86::MULPSrr},
+          VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 4),
+          Subtarget);
+      break;
+    }
+    break;
+  }
+  DEBUG(dbgs() << "\nAlternate sequence for " << MF.getName() << "\n";
+        for (unsigned i = 0; i < InsInstrs.size(); i++) {
+          dbgs() << i << ": ";
+          InsInstrs[i]->print(dbgs(), false, MF.getSubtarget().getInstrInfo());
+        });
+  DelInstrs.push_back(&Root); // Record FDiv for deletion
+}
+
+/// Find instructions that can be turned into recip
+static
bool getFDIVPatterns(MachineInstr &Root,
+                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+  switch (Root.getOpcode()) {
+  default:
+    return false;
+  // TODO: should we support other kinds of instructions?
+  case X86::VDIVSSrr:  // f32
+  case X86::VDIVPSrr:  // v4f32
+  case X86::VDIVPSYrr: // v8f32
+  case X86::DIVSSrr:   // f32
+  case X86::DIVPSrr:   // v4f32
+    break;
+  }
+  auto *MF = Root.getParent()->getParent();
+  auto TLI = MF->getSubtarget().getTargetLowering();
+  EVT VT = getFDivEVT(Root);
+  if (VT == MVT::INVALID_SIMPLE_VALUE_TYPE)
+    return false;
+  // The estimate sequence is not correctly rounded, so never use it unless
+  // unsafe FP math is allowed, even when an estimate was requested
+  // explicitly.
+  if (!MF->getTarget().Options.UnsafeFPMath)
+    return false;
+  if (TLI->getRecipEstimateDivEnabled(VT, *MF) ==
+      TLI->ReciprocalEstimate::Disabled)
+    return false;
+
+  Patterns.push_back(MachineCombinerPattern::Div2RecipEst);
+  return true;
+}
+
+/// Return true when there is potentially a faster code sequence for an
+/// instruction chain ending in \p Root. All potential patterns are listed in
+/// the \p Patterns vector, which should be sorted in priority order since the
+/// pattern evaluator stops checking as soon as it finds a faster sequence.
+/// When an FDIV pattern matches, the generic patterns are not collected.
+bool X86InstrInfo::getMachineCombinerPatterns(
+    MachineInstr &Root,
+    SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+  // FDIV patterns
+  if (getFDIVPatterns(Root, Patterns))
+    return true;
+  // TODO: FSQRT patterns will be prepared after reciprocal implementation
+  // completes
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
+}
+
 /// This is an architecture-specific helper function of reassociateOps.
 /// Set special operand attributes for new instructions after reassociation.
void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, Index: test/CodeGen/X86/recip-fastmath.ll =================================================================== --- test/CodeGen/X86/recip-fastmath.ll +++ test/CodeGen/X86/recip-fastmath.ll @@ -30,9 +30,9 @@ define float @f32_one_step(float %x) #1 { ; SSE-LABEL: f32_one_step: ; SSE: # BB#0: +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: rcpss %xmm0, %xmm2 ; SSE-NEXT: mulss %xmm2, %xmm0 -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: subss %xmm0, %xmm1 ; SSE-NEXT: mulss %xmm2, %xmm1 ; SSE-NEXT: addss %xmm2, %xmm1 @@ -41,12 +41,12 @@ ; ; AVX-LABEL: f32_one_step: ; AVX: # BB#0: -; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %div = fdiv fast float 1.0, %x ret float %div @@ -55,10 +55,10 @@ define float @f32_two_step(float %x) #2 { ; SSE-LABEL: f32_two_step: ; SSE: # BB#0: +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: rcpss %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulss %xmm2, %xmm3 -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: subss %xmm3, %xmm4 ; SSE-NEXT: mulss %xmm2, %xmm4 @@ -72,16 +72,16 @@ ; ; AVX-LABEL: f32_two_step: ; AVX: # BB#0: -; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; 
AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vsubss %xmm3, %xmm1, %xmm3 +; AVX-NEXT: vmulss %xmm2, %xmm3, %xmm3 +; AVX-NEXT: vaddss %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %div = fdiv fast float 1.0, %x ret float %div @@ -107,9 +107,9 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 { ; SSE-LABEL: v4f32_one_step: ; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: rcpps %xmm0, %xmm2 ; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: subps %xmm0, %xmm1 ; SSE-NEXT: mulps %xmm2, %xmm1 ; SSE-NEXT: addps %xmm2, %xmm1 @@ -118,12 +118,12 @@ ; ; AVX-LABEL: v4f32_one_step: ; AVX: # BB#0: -; AVX-NEXT: vrcpps %xmm0, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-NEXT: vrcpps %xmm0, %xmm2 +; AVX-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %div = fdiv fast <4 x float> , %x ret <4 x float> %div @@ -132,10 +132,10 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { ; SSE-LABEL: v4f32_two_step: ; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm1 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: rcpps %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: subps %xmm3, %xmm4 ; SSE-NEXT: mulps %xmm2, %xmm4 @@ -149,16 +149,16 @@ ; ; AVX-LABEL: v4f32_two_step: ; AVX: # BB#0: -; AVX-NEXT: vrcpps %xmm0, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-NEXT: vrcpps %xmm0, %xmm2 +; AVX-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vsubps %xmm3, %xmm1, %xmm3 +; AVX-NEXT: vmulps %xmm2, %xmm3, %xmm3 +; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %div = fdiv fast <4 x float> , %x ret <4 x float> %div @@ -187,9 +187,9 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 { ; SSE-LABEL: v8f32_one_step: ; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: rcpps %xmm0, %xmm4 ; SSE-NEXT: mulps %xmm4, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: subps %xmm0, %xmm3 ; SSE-NEXT: mulps %xmm4, %xmm3 @@ -205,12 +205,12 @@ ; ; AVX-LABEL: v8f32_one_step: ; AVX: # BB#0: -; AVX-NEXT: vrcpps %ymm0, %ymm1 -; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-NEXT: vrcpps %ymm0, %ymm2 +; AVX-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vsubps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div @@ -220,10 +220,10 @@ ; SSE-LABEL: v8f32_two_step: ; SSE: # BB#0: ; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: rcpps %xmm0, %xmm3 ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: mulps %xmm3, %xmm4 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: subps %xmm4, %xmm5 ; SSE-NEXT: mulps %xmm3, %xmm5 @@ -249,16 +249,16 @@ ; ; AVX-LABEL: v8f32_two_step: ; AVX: # BB#0: -; AVX-NEXT: vrcpps %ymm0, %ymm1 -; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; AVX-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-NEXT: vrcpps %ymm0, %ymm2 +; AVX-NEXT: vmulps %ymm2, %ymm0, %ymm3 +; AVX-NEXT: vsubps %ymm3, %ymm1, %ymm3 +; AVX-NEXT: vmulps %ymm2, %ymm3, %ymm3 +; AVX-NEXT: vaddps %ymm2, %ymm3, %ymm2 
+; AVX-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vsubps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div Index: test/CodeGen/X86/recip-fastmath2.ll =================================================================== --- test/CodeGen/X86/recip-fastmath2.ll +++ test/CodeGen/X86/recip-fastmath2.ll @@ -24,8 +24,8 @@ ; SSE-LABEL: f32_one_step_2: ; SSE: # BB#0: ; SSE-NEXT: rcpss %xmm0, %xmm2 -; SSE-NEXT: mulss %xmm2, %xmm0 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: mulss %xmm2, %xmm0 ; SSE-NEXT: subss %xmm0, %xmm1 ; SSE-NEXT: mulss %xmm2, %xmm1 ; SSE-NEXT: addss %xmm2, %xmm1 @@ -36,11 +36,11 @@ ; AVX-LABEL: f32_one_step_2: ; AVX: # BB#0: ; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %div = fdiv fast float 3456.0, %x @@ -51,9 +51,9 @@ ; SSE-LABEL: f32_two_step_2: ; SSE: # BB#0: ; SSE-NEXT: rcpss %xmm0, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulss %xmm2, %xmm3 -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: subss %xmm3, %xmm4 ; SSE-NEXT: mulss %xmm2, %xmm4 @@ -69,15 +69,15 @@ ; AVX-LABEL: f32_two_step_2: ; AVX: # BB#0: ; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovss {{.*#+}} xmm2 = 
mem[0],zero,zero,zero +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm3 +; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm3 +; AVX-NEXT: vmulss %xmm1, %xmm3, %xmm3 +; AVX-NEXT: vaddss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %div = fdiv fast float 6789.0, %x @@ -88,8 +88,8 @@ ; SSE-LABEL: v4f32_one_step2: ; SSE: # BB#0: ; SSE-NEXT: rcpps %xmm0, %xmm2 +; SSE-NEXT: movaps ${{\.LCPI.*}}, %xmm1 ; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: subps %xmm0, %xmm1 ; SSE-NEXT: mulps %xmm2, %xmm1 ; SSE-NEXT: addps %xmm2, %xmm1 @@ -100,11 +100,11 @@ ; AVX-LABEL: v4f32_one_step2: ; AVX: # BB#0: ; AVX-NEXT: vrcpps %xmm0, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %div = fdiv fast <4 x float> , %x @@ -115,9 +115,9 @@ ; SSE-LABEL: v4f32_two_step2: ; SSE: # BB#0: ; SSE-NEXT: rcpps %xmm0, %xmm2 +; SSE-NEXT: movaps ${{\.LCPI.*}}, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: subps %xmm3, %xmm4 ; SSE-NEXT: mulps %xmm2, %xmm4 @@ -133,15 +133,15 @@ ; AVX-LABEL: v4f32_two_step2: ; AVX: # BB#0: ; AVX-NEXT: vrcpps %xmm0, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; 
AVX-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm3 +; AVX-NEXT: vsubps %xmm3, %xmm2, %xmm3 +; AVX-NEXT: vmulps %xmm1, %xmm3, %xmm3 +; AVX-NEXT: vaddps %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %div = fdiv fast <4 x float> , %x @@ -151,32 +151,32 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { ; SSE-LABEL: v8f32_one_step2: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm1, %xmm4 -; SSE-NEXT: mulps %xmm4, %xmm1 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: rcpps %xmm0, %xmm4 +; SSE-NEXT: movaps ${{\.LCPI.*}}, %xmm2 +; SSE-NEXT: mulps %xmm4, %xmm0 ; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: subps %xmm1, %xmm3 +; SSE-NEXT: subps %xmm0, %xmm3 ; SSE-NEXT: mulps %xmm4, %xmm3 ; SSE-NEXT: addps %xmm4, %xmm3 -; SSE-NEXT: rcpps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm1, %xmm0 -; SSE-NEXT: subps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: addps %xmm1, %xmm2 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: rcpps %xmm1, %xmm0 +; SSE-NEXT: mulps %xmm0, %xmm1 +; SSE-NEXT: subps %xmm1, %xmm2 +; SSE-NEXT: mulps %xmm0, %xmm2 +; SSE-NEXT: addps %xmm0, %xmm2 +; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: v8f32_one_step2: ; AVX: 
# BB#0: ; AVX-NEXT: vrcpps %ymm0, %ymm1 -; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: retq %div = fdiv fast <8 x float> , %x @@ -186,48 +186,48 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; SSE-LABEL: v8f32_two_step2: ; SSE: # BB#0: -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: rcpps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: rcpps %xmm0, %xmm3 +; SSE-NEXT: movaps ${{\.LCPI.*}}, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: mulps %xmm3, %xmm4 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: subps %xmm4, %xmm5 ; SSE-NEXT: mulps %xmm3, %xmm5 ; SSE-NEXT: addps %xmm3, %xmm5 -; SSE-NEXT: mulps %xmm5, %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subps %xmm1, %xmm3 +; SSE-NEXT: mulps %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: subps %xmm0, %xmm3 ; SSE-NEXT: mulps %xmm5, %xmm3 ; SSE-NEXT: addps %xmm5, %xmm3 -; SSE-NEXT: rcpps %xmm2, %xmm1 +; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 +; SSE-NEXT: rcpps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: mulps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: mulps %xmm0, %xmm4 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: subps %xmm4, %xmm5 -; SSE-NEXT: mulps %xmm1, %xmm5 -; SSE-NEXT: addps %xmm1, %xmm5 +; SSE-NEXT: mulps %xmm0, %xmm5 +; SSE-NEXT: addps %xmm0, %xmm5 ; SSE-NEXT: mulps %xmm5, %xmm2 -; SSE-NEXT: subps %xmm2, %xmm0 -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: addps %xmm5, 
%xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: subps %xmm2, %xmm1 +; SSE-NEXT: mulps %xmm5, %xmm1 +; SSE-NEXT: addps %xmm5, %xmm1 +; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: v8f32_two_step2: ; AVX: # BB#0: ; AVX-NEXT: vrcpps %ymm0, %ymm1 -; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm3 +; AVX-NEXT: vsubps %ymm3, %ymm2, %ymm3 +; AVX-NEXT: vmulps %ymm1, %ymm3, %ymm3 +; AVX-NEXT: vaddps %ymm1, %ymm3, %ymm1 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; AVX-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vsubps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: retq %div = fdiv fast <8 x float> , %x @@ -252,9 +252,9 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 { ; SSE-LABEL: v8f32_no_step2: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm1, %xmm1 ; SSE-NEXT: rcpps %xmm0, %xmm0 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 +; SSE-NEXT: rcpps %xmm1, %xmm1 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 ; SSE-NEXT: retq ;