Index: include/llvm/CodeGen/AsmPrinter.h =================================================================== --- include/llvm/CodeGen/AsmPrinter.h +++ include/llvm/CodeGen/AsmPrinter.h @@ -20,12 +20,14 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/DwarfStringPoolEntry.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/TargetSchedule.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include #include #include @@ -112,6 +114,9 @@ typedef std::pair GOTEquivUsePair; MapVector GlobalGOTEquivs; + /// The target schedule model. + TargetSchedModel SchedModel; + private: MCSymbol *CurrentFnBegin = nullptr; MCSymbol *CurrentFnEnd = nullptr; Index: include/llvm/CodeGen/MachineInstr.h =================================================================== --- include/llvm/CodeGen/MachineInstr.h +++ include/llvm/CodeGen/MachineInstr.h @@ -60,7 +60,8 @@ /// otherwise easily derivable from the IR text. /// enum CommentFlag { - ReloadReuse = 0x1 // higher bits are reserved for target dep comments. + ReloadReuse = 0x1, // higher bits are reserved for target dep comments. + PrintSchedule = 0x2 }; enum MIFlag { Index: include/llvm/CodeGen/TargetSchedule.h =================================================================== --- include/llvm/CodeGen/TargetSchedule.h +++ include/llvm/CodeGen/TargetSchedule.h @@ -181,6 +181,8 @@ /// This is typically one cycle. 
unsigned computeOutputLatency(const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *DepMI) const; + double computeInstrRThroughput(const MachineInstr *MI, + bool UseDefaultThroughput = true) const; }; } // end namespace llvm Index: include/llvm/MC/MCTargetOptions.h =================================================================== --- include/llvm/MC/MCTargetOptions.h +++ include/llvm/MC/MCTargetOptions.h @@ -47,6 +47,7 @@ bool ShowMCEncoding : 1; bool ShowMCInst : 1; bool AsmVerbose : 1; + bool PrintSchedule : 1; /// Preserve Comments in Assembly. bool PreserveAsmComments : 1; @@ -69,22 +70,15 @@ inline bool operator==(const MCTargetOptions &LHS, const MCTargetOptions &RHS) { #define ARE_EQUAL(X) LHS.X == RHS.X - return (ARE_EQUAL(SanitizeAddress) && - ARE_EQUAL(MCRelaxAll) && - ARE_EQUAL(MCNoExecStack) && - ARE_EQUAL(MCFatalWarnings) && - ARE_EQUAL(MCNoWarn) && - ARE_EQUAL(MCNoDeprecatedWarn) && - ARE_EQUAL(MCSaveTempLabels) && - ARE_EQUAL(MCUseDwarfDirectory) && + return (ARE_EQUAL(SanitizeAddress) && ARE_EQUAL(MCRelaxAll) && + ARE_EQUAL(MCNoExecStack) && ARE_EQUAL(MCFatalWarnings) && + ARE_EQUAL(MCNoWarn) && ARE_EQUAL(MCNoDeprecatedWarn) && + ARE_EQUAL(MCSaveTempLabels) && ARE_EQUAL(MCUseDwarfDirectory) && ARE_EQUAL(MCIncrementalLinkerCompatible) && - ARE_EQUAL(MCPIECopyRelocations) && - ARE_EQUAL(ShowMCEncoding) && - ARE_EQUAL(ShowMCInst) && - ARE_EQUAL(AsmVerbose) && - ARE_EQUAL(DwarfVersion) && - ARE_EQUAL(ABIName) && - ARE_EQUAL(IASSearchPaths)); + ARE_EQUAL(MCPIECopyRelocations) && ARE_EQUAL(ShowMCEncoding) && + ARE_EQUAL(ShowMCInst) && ARE_EQUAL(AsmVerbose) && + ARE_EQUAL(PrintSchedule) && ARE_EQUAL(DwarfVersion) && + ARE_EQUAL(ABIName) && ARE_EQUAL(IASSearchPaths)); #undef ARE_EQUAL } Index: lib/CodeGen/AsmPrinter/AsmPrinter.cpp =================================================================== --- lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -11,6 +11,7 @@ // 
//===----------------------------------------------------------------------===// +#include "llvm/CodeGen/AsmPrinter.h" #include "AsmPrinterHandler.h" #include "CodeViewDebug.h" #include "DwarfDebug.h" @@ -19,17 +20,16 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/CodeGen/Analysis.h" -#include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCMetadataPrinter.h" #include "llvm/CodeGen/GCStrategy.h" @@ -46,6 +46,7 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/TargetSchedule.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" @@ -86,9 +87,9 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/Timer.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" @@ -719,7 +720,8 @@ } /// emitComments - Pretty-print comments for instructions. 
-static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
+static void emitComments(AsmPrinter *AP, const MachineInstr &MI,
+                         raw_ostream &CommentOS) {
   const MachineFunction *MF = MI.getParent()->getParent();
   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
 
@@ -752,6 +754,22 @@
   // Check for spill-induced copies
   if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse))
     CommentOS << " Reload Reuse\n";
+
+  // Append "[latency:reciprocal throughput]" when schedule printing is enabled
+  // globally or requested for this instruction; unknown values print as ???.
+  if (AP->TM.Options.MCOptions.PrintSchedule ||
+      MI.getAsmPrinterFlag(MachineInstr::PrintSchedule)) {
+    if (!(MI.isPseudo() || MI.isTerminator())) {
+      auto Latency = AP->SchedModel.computeInstrLatency(&MI);
+      auto RThroughput = AP->SchedModel.computeInstrRThroughput(&MI);
+      if (Latency > 0 && RThroughput != std::numeric_limits<double>::infinity())
+        CommentOS << "[" << Latency << format(":%2.2f", RThroughput) << "]\n";
+      else if (Latency > 0)
+        CommentOS << "[" << Latency << ":???]\n";
+      else if (RThroughput != std::numeric_limits<double>::infinity())
+        CommentOS << "[???:" << format("%2.2f", RThroughput) << "]\n";
+    }
+  }
 }
 
 /// emitImplicitDef - This method emits the specified machine instruction
@@ -965,7 +983,7 @@
       }
 
       if (isVerbose())
-        emitComments(MI, OutStreamer->GetCommentOS());
+        emitComments(this, MI, OutStreamer->GetCommentOS());
 
       switch (MI.getOpcode()) {
       case TargetOpcode::CFI_INSTRUCTION:
@@ -1380,8 +1398,11 @@
   }
 
   ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
-  if (isVerbose())
+  if (isVerbose()) {
     LI = &getAnalysis<MachineLoopInfo>();
+    const TargetSubtargetInfo &STI = MF.getSubtarget();
+    SchedModel.init(STI.getSchedModel(), &STI, STI.getInstrInfo());
+  }
 }
 
 namespace {
Index: lib/CodeGen/TargetSchedule.cpp
===================================================================
--- lib/CodeGen/TargetSchedule.cpp
+++ lib/CodeGen/TargetSchedule.cpp
@@ -308,3 +308,64 @@
   }
   return 0;
 }
+
+/// Compute the reciprocal throughput of \p MI: the inverse of the smallest
+/// units-per-cycle ratio among the stages (itineraries) or write resources
+/// (scheduling model) the instruction uses.
+///
+/// \returns +infinity, meaning "unknown", when the subtarget has neither a
+/// scheduling model nor itineraries, or when a scheduling-model resource entry
+/// reports zero cycles.
+///
+/// NOTE: if no stage or resource entry with a nonzero cycle count is visited,
+/// Throughput is still +infinity at the return and the result is
+/// 1 / infinity == 0.0, not "unknown".
+double
+TargetSchedModel::computeInstrRThroughput(const MachineInstr *MI,
+                                          bool UseDefaultThroughput) const {
+  // TODO: when UseDefaultThroughput is set, arrange a fallback to default
+  // values such as the ones published on http://www.agner.org/
+  (void)UseDefaultThroughput;
+
+  const double Unknown = std::numeric_limits<double>::infinity();
+  if (!hasInstrSchedModel() && !hasInstrItineraries())
+    return Unknown;
+
+  // Smallest units-per-cycle ratio seen so far.
+  double Throughput = Unknown;
+
+  if (hasInstrItineraries()) {
+    unsigned SchedClass = MI->getDesc().getSchedClass();
+    auto *IID = getInstrItineraries();
+    for (const InstrStage *IS = IID->beginStage(SchedClass),
+                          *E = IID->endStage(SchedClass);
+         IS != E; ++IS) {
+      unsigned Cycles = IS->getCycles();
+      // Stages that take no cycles do not limit the throughput.
+      if (!Cycles)
+        continue;
+      Throughput =
+          std::min(Throughput, countPopulation(IS->getUnits()) * 1.0 / Cycles);
+    }
+    // Callers want the reciprocal throughput, so invert the ratio.
+    return 1 / Throughput;
+  }
+
+  // A scheduling model must be available here: the "neither" case returned
+  // early and the itinerary case returned just above.
+  auto *SCDesc = resolveSchedClass(MI);
+  for (const MCWriteProcResEntry *WPR = STI->getWriteProcResBegin(SCDesc),
+                                 *WEnd = STI->getWriteProcResEnd(SCDesc);
+       WPR != WEnd; ++WPR) {
+    unsigned Cycles = WPR->Cycles;
+    // Unlike the itinerary path, a zero-cycle entry makes the result unknown.
+    if (!Cycles)
+      return Unknown;
+
+    unsigned NumUnits =
+        SchedModel.getProcResource(WPR->ProcResourceIdx)->NumUnits;
+    Throughput = std::min(Throughput, NumUnits * 1.0 / Cycles);
+  }
+  // Callers want the reciprocal throughput, so invert the ratio.
+  return 1 / Throughput;
+}
Index: lib/MC/MCTargetOptions.cpp
===================================================================
--- lib/MC/MCTargetOptions.cpp
+++ lib/MC/MCTargetOptions.cpp
@@ -18,7 +18,7 @@
       MCSaveTempLabels(false), MCUseDwarfDirectory(false),
       MCIncrementalLinkerCompatible(false), MCPIECopyRelocations(false),
       ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false),
-      PreserveAsmComments(true) {}
+      PrintSchedule(false), PreserveAsmComments(true) {}
 
 StringRef MCTargetOptions::getABIName() const {
   return ABIName;
Index: lib/Target/X86/InstPrinter/X86InstComments.h
===================================================================
--- lib/Target/X86/InstPrinter/X86InstComments.h
+++ lib/Target/X86/InstPrinter/X86InstComments.h
@@ -17,14 +17,14 @@
 
 namespace llvm {
 
-  enum
AsmComments { - AC_EVEX_2_VEX = 0x2 // For instr that was compressed from EVEX to VEX. - }; +enum AsmComments { + AC_EVEX_2_VEX = 0x4 // For instr that was compressed from EVEX to VEX. +}; - class MCInst; - class raw_ostream; - bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, - const char *(*getRegName)(unsigned)); +class MCInst; +class raw_ostream; +bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, + const char *(*getRegName)(unsigned)); } #endif Index: test/CodeGen/X86/recip-fastmath2.ll =================================================================== --- test/CodeGen/X86/recip-fastmath2.ll +++ test/CodeGen/X86/recip-fastmath2.ll @@ -1,63 +1,124 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | 
FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX ; It's the extra tests coverage for recip as discussed on D26855. +define float @f32_no_estimate_2(float %x) #0 { +; SSE-LABEL: f32_no_estimate_2: +; SSE: # BB#0: +; SSE-NEXT: movss {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: divss %xmm0, %xmm1 # [10:???] +; SSE-NEXT: movaps %xmm1, %xmm0 # [1:???] 
+; SSE-NEXT: retq +; +; AVX-RECIP-LABEL: f32_no_estimate_2: +; AVX-RECIP: # BB#0: +; AVX-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; AVX-RECIP-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [10:???] +; AVX-RECIP-NEXT: retq +; +; FMA-RECIP-LABEL: f32_no_estimate_2: +; FMA-RECIP: # BB#0: +; FMA-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; FMA-RECIP-NEXT: # xmm1 = mem[0],zero,zero,zero +; FMA-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [10:???] +; FMA-RECIP-NEXT: retq +; +; BTVER2-LABEL: f32_no_estimate_2: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovss {{.*}}(%rip), %xmm1 # [5:1.00] +; BTVER2-NEXT: # xmm1 = mem[0],zero,zero,zero +; BTVER2-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [19:19.00] +; BTVER2-NEXT: retq +; +; SANDY-LABEL: f32_no_estimate_2: +; SANDY: # BB#0: +; SANDY-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:0.50] +; SANDY-NEXT: # xmm1 = mem[0],zero,zero,zero +; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:1.00] +; SANDY-NEXT: retq +; +; HASWELL-LABEL: f32_no_estimate_2: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:0.50] +; HASWELL-NEXT: # xmm1 = mem[0],zero,zero,zero +; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:1.00] +; HASWELL-NEXT: retq +; +; HASWELL-NO-FMA-LABEL: f32_no_estimate_2: +; HASWELL-NO-FMA: # BB#0: +; HASWELL-NO-FMA-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:0.50] +; HASWELL-NO-FMA-NEXT: # xmm1 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:1.00] +; HASWELL-NO-FMA-NEXT: retq +; +; AVX512-LABEL: f32_no_estimate_2: +; AVX512: # BB#0: +; AVX512-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:0.50] +; AVX512-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:1.00] +; AVX512-NEXT: retq + %div = fdiv fast float 1234.0, %x + ret float %div +} + define float @f32_no_step_2(float %x) #3 { ; SSE-LABEL: f32_no_step_2: ; SSE: # BB#0: -; SSE-NEXT: rcpss %xmm0, %xmm0 -; SSE-NEXT: mulss {{.*}}(%rip), %xmm0 +; SSE-NEXT: rcpss %xmm0, %xmm0 # 
[1:???] +; SSE-NEXT: mulss {{.*}}(%rip), %xmm0 # [4:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_no_step_2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_no_step_2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:???] ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: f32_no_step_2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # [2:1.00] +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [7:1.00] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_no_step_2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # [5:1.00] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:1.00] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_no_step_2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # [5:1.00] +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_no_step_2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_no_step_2: ; AVX512: # BB#0: -; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; 
AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0 # [1:0.00] +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; AVX512-NEXT: retq %div = fdiv fast float 1234.0, %x ret float %div @@ -66,174 +127,264 @@ define float @f32_one_step_2(float %x) #1 { ; SSE-LABEL: f32_one_step_2: ; SSE: # BB#0: -; SSE-NEXT: rcpss %xmm0, %xmm2 -; SSE-NEXT: mulss %xmm2, %xmm0 -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: subss %xmm0, %xmm1 -; SSE-NEXT: mulss %xmm2, %xmm1 -; SSE-NEXT: addss %xmm2, %xmm1 -; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: rcpss %xmm0, %xmm2 # [1:???] +; SSE-NEXT: mulss %xmm2, %xmm0 # [1:???] +; SSE-NEXT: movss {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: subss %xmm0, %xmm1 # [1:???] +; SSE-NEXT: mulss %xmm2, %xmm1 # [1:???] +; SSE-NEXT: addss %xmm2, %xmm1 # [1:???] +; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: movaps %xmm1, %xmm0 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_one_step_2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [1:???] +; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:???] +; AVX-RECIP-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:???] 
; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_one_step_2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:???] +; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:???] ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: f32_one_step_2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vmovss {{.*}}(%rip), %xmm2 # [5:1.00] +; BTVER2-NEXT: # xmm2 = mem[0],zero,zero,zero +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [2:1.00] +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [2:1.00] +; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # [3:1.00] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [2:1.00] +; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:1.00] +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [7:1.00] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_one_step_2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00] +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [5:1.00] +; SANDY-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:0.50] +; SANDY-NEXT: # xmm2 = mem[0],zero,zero,zero +; SANDY-NEXT: vsubss %xmm0, 
%xmm2, %xmm0 # [3:1.00] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:1.00] +; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:1.00] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:1.00] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_one_step_2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00] +; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:0.00] +; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:0.00] +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_one_step_2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:0.50] +; HASWELL-NO-FMA-NEXT: # xmm2 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_one_step_2: ; AVX512: # BB#0: -; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 
# [1:0.00] +; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:0.00] +; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:0.00] +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; AVX512-NEXT: retq %div = fdiv fast float 3456.0, %x ret float %div } +define float @f32_no_estimate_2_divs(float %x) #0 { +; SSE-LABEL: f32_no_estimate_2_divs: +; SSE: # BB#0: +; SSE-NEXT: movss {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: divss %xmm0, %xmm1 # [10:???] +; SSE-NEXT: movss {{.*}}(%rip), %xmm0 # [4:???] +; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: mulss %xmm1, %xmm0 # [1:???] +; SSE-NEXT: mulss %xmm1, %xmm0 # [1:???] +; SSE-NEXT: retq +; +; AVX-RECIP-LABEL: f32_no_estimate_2_divs: +; AVX-RECIP: # BB#0: +; AVX-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; AVX-RECIP-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [10:???] +; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [4:???] +; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: retq +; +; FMA-RECIP-LABEL: f32_no_estimate_2_divs: +; FMA-RECIP: # BB#0: +; FMA-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; FMA-RECIP-NEXT: # xmm1 = mem[0],zero,zero,zero +; FMA-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [10:???] +; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [4:???] +; FMA-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [1:???] 
+; FMA-RECIP-NEXT: retq +; +; BTVER2-LABEL: f32_no_estimate_2_divs: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovss {{.*}}(%rip), %xmm1 # [5:1.00] +; BTVER2-NEXT: # xmm1 = mem[0],zero,zero,zero +; BTVER2-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [19:19.00] +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [7:1.00] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [2:1.00] +; BTVER2-NEXT: retq +; +; SANDY-LABEL: f32_no_estimate_2_divs: +; SANDY: # BB#0: +; SANDY-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:0.50] +; SANDY-NEXT: # xmm1 = mem[0],zero,zero,zero +; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:1.00] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:1.00] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:1.00] +; SANDY-NEXT: retq +; +; HASWELL-LABEL: f32_no_estimate_2_divs: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:0.50] +; HASWELL-NEXT: # xmm1 = mem[0],zero,zero,zero +; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:1.00] +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50] +; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:0.50] +; HASWELL-NEXT: retq +; +; HASWELL-NO-FMA-LABEL: f32_no_estimate_2_divs: +; HASWELL-NO-FMA: # BB#0: +; HASWELL-NO-FMA-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:0.50] +; HASWELL-NO-FMA-NEXT: # xmm1 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:1.00] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50] +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:0.50] +; HASWELL-NO-FMA-NEXT: retq +; +; AVX512-LABEL: f32_no_estimate_2_divs: +; AVX512: # BB#0: +; AVX512-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:0.50] +; AVX512-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:1.00] +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50] +; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:0.50] +; AVX512-NEXT: retq + %div = fdiv fast float 3456.0, %x + %div2 = fdiv fast float %div, %x + ret float %div2 +} + define float @f32_one_step_2_divs(float 
%x) #1 { ; SSE-LABEL: f32_one_step_2_divs: ; SSE: # BB#0: -; SSE-NEXT: rcpss %xmm0, %xmm1 -; SSE-NEXT: mulss %xmm1, %xmm0 -; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-NEXT: subss %xmm0, %xmm2 -; SSE-NEXT: mulss %xmm1, %xmm2 -; SSE-NEXT: addss %xmm1, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: mulss %xmm2, %xmm0 -; SSE-NEXT: mulss %xmm2, %xmm0 +; SSE-NEXT: rcpss %xmm0, %xmm1 # [1:???] +; SSE-NEXT: mulss %xmm1, %xmm0 # [1:???] +; SSE-NEXT: movss {{.*}}(%rip), %xmm2 # [4:???] +; SSE-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE-NEXT: subss %xmm0, %xmm2 # [1:???] +; SSE-NEXT: mulss %xmm1, %xmm2 # [1:???] +; SSE-NEXT: addss %xmm1, %xmm2 # [1:???] +; SSE-NEXT: movss {{.*}}(%rip), %xmm0 # [4:???] +; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: mulss %xmm2, %xmm0 # [1:???] +; SSE-NEXT: mulss %xmm2, %xmm0 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_one_step_2_divs: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [1:???] +; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:???] +; AVX-RECIP-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [4:???] +; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [1:???] 
; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_one_step_2_divs: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:???] +; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [4:???] +; FMA-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [1:???] ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: f32_one_step_2_divs: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmovss {{.*}}(%rip), %xmm2 # [5:1.00] +; BTVER2-NEXT: # xmm2 = mem[0],zero,zero,zero +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [2:1.00] +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [2:1.00] +; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # [3:1.00] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [2:1.00] +; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:1.00] +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [7:1.00] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [2:1.00] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; SANDY-NEXT: vmulss 
%xmm0, %xmm1, %xmm0 +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00] +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [5:1.00] +; SANDY-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:0.50] +; SANDY-NEXT: # xmm2 = mem[0],zero,zero,zero +; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # [3:1.00] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:1.00] +; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:1.00] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:1.00] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:1.00] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_one_step_2_divs: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00] +; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:0.00] +; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:0.00] +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50] +; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:0.50] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:0.50] +; HASWELL-NO-FMA-NEXT: # xmm2 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, 
%xmm0 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50] +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:0.50] ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_one_step_2_divs: ; AVX512: # BB#0: -; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # [1:0.00] +; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:0.00] +; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:0.00] +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50] +; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:0.50] ; AVX512-NEXT: retq %div = fdiv fast float 3456.0, %x %div2 = fdiv fast float %div, %x @@ -243,210 +394,283 @@ define float @f32_two_step_2(float %x) #2 { ; SSE-LABEL: f32_two_step_2: ; SSE: # BB#0: -; SSE-NEXT: rcpss %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: mulss %xmm2, %xmm3 -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: subss %xmm3, %xmm4 -; SSE-NEXT: mulss %xmm2, %xmm4 -; SSE-NEXT: addss %xmm2, %xmm4 -; SSE-NEXT: mulss %xmm4, %xmm0 -; SSE-NEXT: subss %xmm0, %xmm1 -; SSE-NEXT: mulss %xmm4, %xmm1 -; SSE-NEXT: addss %xmm4, %xmm1 -; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: rcpss %xmm0, %xmm2 # [1:???] +; SSE-NEXT: movaps %xmm0, %xmm3 # [1:???] +; SSE-NEXT: mulss %xmm2, %xmm3 # [1:???] +; SSE-NEXT: movss {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movaps %xmm1, %xmm4 # [1:???] +; SSE-NEXT: subss %xmm3, %xmm4 # [1:???] +; SSE-NEXT: mulss %xmm2, %xmm4 # [1:???] +; SSE-NEXT: addss %xmm2, %xmm4 # [1:???] +; SSE-NEXT: mulss %xmm4, %xmm0 # [1:???] +; SSE-NEXT: subss %xmm0, %xmm1 # [1:???] 
+; SSE-NEXT: mulss %xmm4, %xmm1 # [1:???] +; SSE-NEXT: addss %xmm4, %xmm1 # [1:???] +; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: movaps %xmm1, %xmm0 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_two_step_2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [1:???] +; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2 # [1:???] +; AVX-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm3 # [4:???] +; AVX-RECIP-NEXT: # xmm3 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2 # [1:???] +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2 # [1:???] +; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1 # [1:???] +; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:???] 
; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_two_step_2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 -; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 -; FMA-RECIP-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [1:???] +; FMA-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:???] +; FMA-RECIP-NEXT: # xmm2 = mem[0],zero,zero,zero +; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # [1:???] +; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:???] 
; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: f32_two_step_2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vmovss {{.*}}(%rip), %xmm3 # [5:1.00] +; BTVER2-NEXT: # xmm3 = mem[0],zero,zero,zero +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [2:1.00] +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 # [2:1.00] +; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 # [3:1.00] +; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 # [2:1.00] +; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 # [3:1.00] +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [2:1.00] +; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 # [3:1.00] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [2:1.00] +; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:1.00] +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [7:1.00] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_two_step_2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00] +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # [5:1.00] +; SANDY-NEXT: vmovss {{.*}}(%rip), %xmm3 # [4:0.50] +; SANDY-NEXT: # xmm3 = mem[0],zero,zero,zero +; SANDY-NEXT: 
vsubss %xmm2, %xmm3, %xmm2 # [3:1.00] +; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # [5:1.00] +; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # [3:1.00] +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [5:1.00] +; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # [3:1.00] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:1.00] +; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:1.00] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:1.00] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_two_step_2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 -; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 -; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00] +; HASWELL-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:0.50] +; HASWELL-NEXT: # xmm2 = mem[0],zero,zero,zero +; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # [1:1.00] +; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # [1:0.00] +; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # [1:0.00] +; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # [1:0.00] +; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # [1:0.00] +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_two_step_2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, 
%xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vmovss {{.*}}(%rip), %xmm3 # [4:0.50] +; HASWELL-NO-FMA-NEXT: # xmm3 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_two_step_2: ; AVX512: # BB#0: -; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovaps %xmm1, %xmm3 -; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 -; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 -; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # [1:0.00] +; AVX512-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:0.50] +; AVX512-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovaps %xmm1, %xmm3 # [1:1.00] +; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # [1:0.00] +; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # [1:0.00] +; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # [1:0.00] +; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # [1:0.00] +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; AVX512-NEXT: retq %div = fdiv fast float 6789.0, %x ret float %div } +define <4 x float> @v4f32_no_estimate2(<4 x float> %x) #0 { +; SSE-LABEL: v4f32_no_estimate2: +; SSE: # BB#0: +; SSE-NEXT: movaps 
{{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SSE-NEXT: divps %xmm0, %xmm1 # [10:???] +; SSE-NEXT: movaps %xmm1, %xmm0 # [1:???] +; SSE-NEXT: retq +; +; AVX-RECIP-LABEL: v4f32_no_estimate2: +; AVX-RECIP: # BB#0: +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %xmm1 # [4:???] +; AVX-RECIP-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 # [10:???] +; AVX-RECIP-NEXT: retq +; +; FMA-RECIP-LABEL: v4f32_no_estimate2: +; FMA-RECIP: # BB#0: +; FMA-RECIP-NEXT: vmovaps {{.*}}(%rip), %xmm1 # [4:???] +; FMA-RECIP-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 # [10:???] +; FMA-RECIP-NEXT: retq +; +; BTVER2-LABEL: v4f32_no_estimate2: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm1 # [5:1.00] +; BTVER2-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 # [19:19.00] +; BTVER2-NEXT: retq +; +; SANDY-LABEL: v4f32_no_estimate2: +; SANDY: # BB#0: +; SANDY-NEXT: vmovaps {{.*}}(%rip), %xmm1 # [4:0.50] +; SANDY-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # [12:1.00] +; SANDY-NEXT: retq +; +; HASWELL-LABEL: v4f32_no_estimate2: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovaps {{.*}}(%rip), %xmm1 # [4:0.50] +; HASWELL-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # [12:1.00] +; HASWELL-NEXT: retq +; +; HASWELL-NO-FMA-LABEL: v4f32_no_estimate2: +; HASWELL-NO-FMA: # BB#0: +; HASWELL-NO-FMA-NEXT: vmovaps {{.*}}(%rip), %xmm1 # [4:0.50] +; HASWELL-NO-FMA-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 # [12:1.00] +; HASWELL-NO-FMA-NEXT: retq +; +; AVX512-LABEL: v4f32_no_estimate2: +; AVX512: # BB#0: +; AVX512-NEXT: vmovaps 
{{.*}}(%rip), %xmm1 # [4:0.50] +; AVX512-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 # [12:1.00] +; AVX512-NEXT: retq + %div = fdiv fast <4 x float> , %x + ret <4 x float> %div +} + define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 { ; SSE-LABEL: v4f32_one_step2: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SSE-NEXT: subps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm2, %xmm1 -; SSE-NEXT: addps %xmm2, %xmm1 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: rcpps %xmm0, %xmm2 # [1:???] +; SSE-NEXT: mulps %xmm2, %xmm0 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: subps %xmm0, %xmm1 # [1:???] +; SSE-NEXT: mulps %xmm2, %xmm1 # [1:???] +; SSE-NEXT: addps %xmm2, %xmm1 # [1:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: movaps %xmm1, %xmm0 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v4f32_one_step2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 # [1:???] +; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %xmm2 # [4:???] +; AVX-RECIP-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [1:???] 
+; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [4:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_one_step2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0 -; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0 # [4:???] +; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [4:???] ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v4f32_one_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm2 # [5:1.00] +; BTVER2-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # [2:1.00] +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [2:1.00] +; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [3:1.00] +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [2:1.00] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:1.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [7:1.00] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_one_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # [5:1.00] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [5:1.00] +; 
SANDY-NEXT: vmovaps {{.*}}(%rip), %xmm2 # [4:0.50] +; SANDY-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [3:1.00] +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:1.00] +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:1.00] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_one_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # [5:1.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50] +; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # [1:0.00] +; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:0.00] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50] +; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v4f32_one_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vrcpps %xmm0, %xmm1 # [5:1.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50] +; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # [1:0.00] +; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:0.00] +; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; KNL-NEXT: retq ; ; SKX-LABEL: v4f32_one_step2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %xmm0, %xmm1 -; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # [1:0.00] +; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # [4:0.00] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:0.00] +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; SKX-NEXT: retq %div = fdiv fast <4 x float> , %x ret <4 x float> %div @@ -455,101 +679,106 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 { ; SSE-LABEL: v4f32_one_step_2_divs: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm1, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SSE-NEXT: subps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: addps %xmm1, %xmm2 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] -; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: mulps %xmm2, %xmm0 +; SSE-NEXT: rcpps %xmm0, %xmm1 # [1:???] +; SSE-NEXT: mulps %xmm1, %xmm0 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm2 # [4:???] +; SSE-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: subps %xmm0, %xmm2 # [1:???] +; SSE-NEXT: mulps %xmm1, %xmm2 # [1:???] +; SSE-NEXT: addps %xmm1, %xmm2 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm0 # [4:???] 
+; SSE-NEXT: # xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SSE-NEXT: mulps %xmm2, %xmm0 # [1:???] +; SSE-NEXT: mulps %xmm2, %xmm0 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v4f32_one_step_2_divs: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 # [1:???] +; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %xmm2 # [4:???] +; AVX-RECIP-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [4:???] +; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [1:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_one_step_2_divs: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0 -; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0 # [4:???] +; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [4:???] +; FMA-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [1:???] 
; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v4f32_one_step_2_divs: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm2 # [5:1.00] +; BTVER2-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # [2:1.00] +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [2:1.00] +; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [3:1.00] +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [2:1.00] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:1.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [7:1.00] +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [2:1.00] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # [5:1.00] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [5:1.00] +; SANDY-NEXT: vmovaps {{.*}}(%rip), %xmm2 # [4:0.50] +; SANDY-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [3:1.00] +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:1.00] +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [9:1.00] +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:1.00] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: 
v4f32_one_step_2_divs: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # [5:1.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50] +; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # [1:0.00] +; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:0.00] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50] +; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:0.50] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50] +; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50] +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:0.50] ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v4f32_one_step_2_divs: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; KNL-NEXT: vmulps 
%xmm0, %xmm1, %xmm0 +; KNL-NEXT: vrcpps %xmm0, %xmm1 # [5:1.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50] +; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # [1:0.00] +; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:0.00] +; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50] +; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:0.50] ; KNL-NEXT: retq ; ; SKX-LABEL: v4f32_one_step_2_divs: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %xmm0, %xmm1 -; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # [1:0.00] +; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # [4:0.00] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:0.00] +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50] +; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:0.50] ; SKX-NEXT: retq %div = fdiv fast <4 x float> , %x %div2 = fdiv fast <4 x float> %div, %x @@ -559,230 +788,304 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; SSE-LABEL: v4f32_two_step2: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: subps %xmm3, %xmm4 -; SSE-NEXT: mulps %xmm2, %xmm4 -; SSE-NEXT: addps %xmm2, %xmm4 -; SSE-NEXT: mulps %xmm4, %xmm0 -; SSE-NEXT: subps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm4, %xmm1 -; SSE-NEXT: addps %xmm4, %xmm1 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: rcpps %xmm0, %xmm2 # [1:???] +; SSE-NEXT: movaps %xmm0, %xmm3 # [1:???] +; SSE-NEXT: mulps %xmm2, %xmm3 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: movaps %xmm1, %xmm4 # [1:???] +; SSE-NEXT: subps %xmm3, %xmm4 # [1:???] 
+; SSE-NEXT: mulps %xmm2, %xmm4 # [1:???] +; SSE-NEXT: addps %xmm2, %xmm4 # [1:???] +; SSE-NEXT: mulps %xmm4, %xmm0 # [1:???] +; SSE-NEXT: subps %xmm0, %xmm1 # [1:???] +; SSE-NEXT: mulps %xmm4, %xmm1 # [1:???] +; SSE-NEXT: addps %xmm4, %xmm1 # [1:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: movaps %xmm1, %xmm0 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v4f32_two_step2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 # [1:???] +; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2 # [1:???] +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %xmm3 # [4:???] +; AVX-RECIP-NEXT: # xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 # [1:???] +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 # [1:???] +; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 # [1:???] +; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [4:???] 
; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_two_step2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 -; FMA-RECIP-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 # [1:???] +; FMA-RECIP-NEXT: vmovaps {{.*}}(%rip), %xmm2 # [4:???] +; FMA-RECIP-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # [1:???] +; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [4:???] 
; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v4f32_two_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm3 # [5:1.00] +; BTVER2-NEXT: # xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # [2:1.00] +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 # [2:1.00] +; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 # [3:1.00] +; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 # [2:1.00] +; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 # [3:1.00] +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [2:1.00] +; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 # [3:1.00] +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [2:1.00] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:1.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [7:1.00] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_two_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # [5:1.00] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # [5:1.00] +; SANDY-NEXT: vmovaps {{.*}}(%rip), %xmm3 
# [4:0.50] +; SANDY-NEXT: # xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # [3:1.00] +; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # [5:1.00] +; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # [3:1.00] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [5:1.00] +; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # [3:1.00] +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:1.00] +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:1.00] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_two_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 -; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # [5:1.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50] +; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # [1:1.00] +; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # [1:0.00] +; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # [1:0.00] +; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # [1:0.00] +; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # [1:0.00] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_two_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 -; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; 
HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 # [4:0.50] +; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v4f32_two_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; KNL-NEXT: vmovaps %xmm1, %xmm3 -; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 -; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vrcpps %xmm0, %xmm1 # [5:1.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50] +; KNL-NEXT: vmovaps %xmm1, %xmm3 # [1:1.00] +; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # [1:0.00] +; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # [1:0.00] +; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # [1:0.00] +; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # [1:0.00] +; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; KNL-NEXT: retq ; ; SKX-LABEL: v4f32_two_step2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %xmm0, %xmm1 -; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; SKX-NEXT: vmovaps %xmm1, %xmm3 -; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 -; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, 
%xmm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # [1:0.00] +; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50] +; SKX-NEXT: vmovaps %xmm1, %xmm3 # [1:1.00] +; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # [1:0.00] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # [1:0.00] +; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # [1:0.00] +; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # [1:0.00] +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50] ; SKX-NEXT: retq %div = fdiv fast <4 x float> , %x ret <4 x float> %div } +define <8 x float> @v8f32_no_estimate2(<8 x float> %x) #0 { +; SSE-LABEL: v8f32_no_estimate2: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*}}(%rip), %xmm2 # [4:???] +; SSE-NEXT: # xmm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SSE-NEXT: divps %xmm0, %xmm2 # [10:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm3 # [4:???] +; SSE-NEXT: # xmm3 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; SSE-NEXT: divps %xmm1, %xmm3 # [10:???] +; SSE-NEXT: movaps %xmm2, %xmm0 # [1:???] +; SSE-NEXT: movaps %xmm3, %xmm1 # [1:???] +; SSE-NEXT: retq +; +; AVX-RECIP-LABEL: v8f32_no_estimate2: +; AVX-RECIP: # BB#0: +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %ymm1 # [4:???] +; AVX-RECIP-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 # [10:???] +; AVX-RECIP-NEXT: retq +; +; FMA-RECIP-LABEL: v8f32_no_estimate2: +; FMA-RECIP: # BB#0: +; FMA-RECIP-NEXT: vmovaps {{.*}}(%rip), %ymm1 # [4:???] +; FMA-RECIP-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 # [10:???] 
+; FMA-RECIP-NEXT: retq +; +; BTVER2-LABEL: v8f32_no_estimate2: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm1 # [5:1.00] +; BTVER2-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 # [19:19.00] +; BTVER2-NEXT: retq +; +; SANDY-LABEL: v8f32_no_estimate2: +; SANDY: # BB#0: +; SANDY-NEXT: vmovaps {{.*}}(%rip), %ymm1 # [4:0.50] +; SANDY-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 # [12:1.00] +; SANDY-NEXT: retq +; +; HASWELL-LABEL: v8f32_no_estimate2: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovaps {{.*}}(%rip), %ymm1 # [4:0.50] +; HASWELL-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # [19:2.00] +; HASWELL-NEXT: retq +; +; HASWELL-NO-FMA-LABEL: v8f32_no_estimate2: +; HASWELL-NO-FMA: # BB#0: +; HASWELL-NO-FMA-NEXT: vmovaps {{.*}}(%rip), %ymm1 # [4:0.50] +; HASWELL-NO-FMA-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 # [19:2.00] +; HASWELL-NO-FMA-NEXT: retq +; +; AVX512-LABEL: v8f32_no_estimate2: +; AVX512: # BB#0: +; AVX512-NEXT: vmovaps {{.*}}(%rip), %ymm1 # [4:0.50] +; AVX512-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 # [19:2.00] +; AVX512-NEXT: retq + %div = fdiv fast <8 x float> , %x + ret <8 x float> %div +} + define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { ; SSE-LABEL: v8f32_one_step2: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm1, %xmm4 -; SSE-NEXT: mulps %xmm4, %xmm1 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 
-; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: subps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm4, %xmm3 -; SSE-NEXT: addps %xmm4, %xmm3 -; SSE-NEXT: rcpps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm1, %xmm0 -; SSE-NEXT: subps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: addps %xmm1, %xmm2 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: rcpps %xmm1, %xmm4 # [1:???] +; SSE-NEXT: mulps %xmm4, %xmm1 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm2 # [4:???] +; SSE-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: movaps %xmm2, %xmm3 # [1:???] +; SSE-NEXT: subps %xmm1, %xmm3 # [1:???] +; SSE-NEXT: mulps %xmm4, %xmm3 # [1:???] +; SSE-NEXT: addps %xmm4, %xmm3 # [1:???] +; SSE-NEXT: rcpps %xmm0, %xmm1 # [1:???] +; SSE-NEXT: mulps %xmm1, %xmm0 # [1:???] +; SSE-NEXT: subps %xmm0, %xmm2 # [1:???] +; SSE-NEXT: mulps %xmm1, %xmm2 # [1:???] +; SSE-NEXT: addps %xmm1, %xmm2 # [1:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 # [4:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 # [4:???] +; SSE-NEXT: movaps %xmm2, %xmm0 # [1:???] +; SSE-NEXT: movaps %xmm3, %xmm1 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_one_step2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 # [1:???] +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %ymm2 # [4:???] 
+; AVX-RECIP-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_one_step2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0 -; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0 # [4:???] +; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:???] +; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:???] ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_one_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm2 # [5:1.00] +; BTVER2-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # [2:1.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [2:1.00] +; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [3:1.00] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [2:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:1.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [7:1.00] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_one_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; 
SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SANDY-NEXT: vrcpps %ymm0, %ymm1 # [5:1.00] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [5:1.00] +; SANDY-NEXT: vmovaps {{.*}}(%rip), %ymm2 # [4:0.50] +; SANDY-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [3:1.00] +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:1.00] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_one_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # [7:2.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00] +; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # [1:0.00] +; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:0.00] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # [7:2.00] +; HASWELL-NO-FMA-NEXT: 
vmulps %ymm1, %ymm0, %ymm0 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00] ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v8f32_one_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vrcpps %ymm0, %ymm1 # [7:2.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00] +; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # [1:0.00] +; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:0.00] +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00] ; KNL-NEXT: retq ; ; SKX-LABEL: v8f32_one_step2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm1 -; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # [1:0.00] +; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # [4:0.00] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:0.00] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00] ; SKX-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div @@ -791,110 +1094,116 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 { ; SSE-LABEL: v8f32_one_step_2_divs: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: subps %xmm0, %xmm4 -; SSE-NEXT: mulps %xmm2, %xmm4 -; SSE-NEXT: addps %xmm2, %xmm4 -; SSE-NEXT: rcpps %xmm1, %xmm0 -; SSE-NEXT: mulps %xmm0, %xmm1 -; SSE-NEXT: subps %xmm1, %xmm3 -; 
SSE-NEXT: mulps %xmm0, %xmm3 -; SSE-NEXT: addps %xmm0, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] -; SSE-NEXT: mulps %xmm3, %xmm1 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] -; SSE-NEXT: mulps %xmm4, %xmm0 -; SSE-NEXT: mulps %xmm4, %xmm0 -; SSE-NEXT: mulps %xmm3, %xmm1 +; SSE-NEXT: rcpps %xmm0, %xmm2 # [1:???] +; SSE-NEXT: mulps %xmm2, %xmm0 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm3 # [4:???] +; SSE-NEXT: # xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: movaps %xmm3, %xmm4 # [1:???] +; SSE-NEXT: subps %xmm0, %xmm4 # [1:???] +; SSE-NEXT: mulps %xmm2, %xmm4 # [1:???] +; SSE-NEXT: addps %xmm2, %xmm4 # [1:???] +; SSE-NEXT: rcpps %xmm1, %xmm0 # [1:???] +; SSE-NEXT: mulps %xmm0, %xmm1 # [1:???] +; SSE-NEXT: subps %xmm1, %xmm3 # [1:???] +; SSE-NEXT: mulps %xmm0, %xmm3 # [1:???] +; SSE-NEXT: addps %xmm0, %xmm3 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; SSE-NEXT: mulps %xmm3, %xmm1 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm0 # [4:???] +; SSE-NEXT: # xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SSE-NEXT: mulps %xmm4, %xmm0 # [1:???] +; SSE-NEXT: mulps %xmm4, %xmm0 # [1:???] +; SSE-NEXT: mulps %xmm3, %xmm1 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_one_step_2_divs: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 # [1:???] 
+; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %ymm2 # [4:???] +; AVX-RECIP-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [4:???] +; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [1:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_one_step_2_divs: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0 -; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; FMA-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0 # [4:???] +; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:???] +; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [4:???] +; FMA-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [1:???] 
; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_one_step_2_divs: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm2 # [5:1.00] +; BTVER2-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # [2:1.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [2:1.00] +; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [3:1.00] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [2:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:1.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [7:1.00] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [2:1.00] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; SANDY-NEXT: vrcpps %ymm0, %ymm1 # [5:1.00] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [5:1.00] +; SANDY-NEXT: vmovaps {{.*}}(%rip), %ymm2 # [4:0.50] +; SANDY-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [3:1.00] +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:1.00] 
+; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [9:1.00] +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:1.00] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_one_step_2_divs: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # [7:2.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00] +; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # [1:0.00] +; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:0.00] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [9:1.00] +; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:1.00] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # [7:2.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [9:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:1.00] ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v8f32_one_step_2_divs: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; KNL-NEXT: vrcpps %ymm0, %ymm1 # [7:2.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00] +; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # [1:0.00] +; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:0.00] +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [9:1.00] +; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:1.00] ; KNL-NEXT: retq ; ; SKX-LABEL: v8f32_one_step_2_divs: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm1 -; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # [1:0.00] +; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # [4:0.00] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:0.00] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [9:1.00] +; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:1.00] ; SKX-NEXT: retq %div = fdiv fast <8 x float> , %x %div2 = fdiv fast <8 x float> %div, %x @@ -904,142 +1213,147 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; SSE-LABEL: v8f32_two_step2: ; SSE: # BB#0: -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: rcpps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: mulps %xmm3, %xmm4 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: subps %xmm4, %xmm5 -; SSE-NEXT: mulps %xmm3, %xmm5 -; SSE-NEXT: addps %xmm3, %xmm5 -; SSE-NEXT: mulps %xmm5, %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm5, %xmm3 -; SSE-NEXT: addps %xmm5, %xmm3 -; SSE-NEXT: rcpps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: mulps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: subps 
%xmm4, %xmm5 -; SSE-NEXT: mulps %xmm1, %xmm5 -; SSE-NEXT: addps %xmm1, %xmm5 -; SSE-NEXT: mulps %xmm5, %xmm2 -; SSE-NEXT: subps %xmm2, %xmm0 -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: addps %xmm5, %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 # [1:???] +; SSE-NEXT: rcpps %xmm1, %xmm3 # [1:???] +; SSE-NEXT: movaps %xmm1, %xmm4 # [1:???] +; SSE-NEXT: mulps %xmm3, %xmm4 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm0 # [4:???] +; SSE-NEXT: # xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: movaps %xmm0, %xmm5 # [1:???] +; SSE-NEXT: subps %xmm4, %xmm5 # [1:???] +; SSE-NEXT: mulps %xmm3, %xmm5 # [1:???] +; SSE-NEXT: addps %xmm3, %xmm5 # [1:???] +; SSE-NEXT: mulps %xmm5, %xmm1 # [1:???] +; SSE-NEXT: movaps %xmm0, %xmm3 # [1:???] +; SSE-NEXT: subps %xmm1, %xmm3 # [1:???] +; SSE-NEXT: mulps %xmm5, %xmm3 # [1:???] +; SSE-NEXT: addps %xmm5, %xmm3 # [1:???] +; SSE-NEXT: rcpps %xmm2, %xmm1 # [1:???] +; SSE-NEXT: movaps %xmm2, %xmm4 # [1:???] +; SSE-NEXT: mulps %xmm1, %xmm4 # [1:???] +; SSE-NEXT: movaps %xmm0, %xmm5 # [1:???] +; SSE-NEXT: subps %xmm4, %xmm5 # [1:???] +; SSE-NEXT: mulps %xmm1, %xmm5 # [1:???] +; SSE-NEXT: addps %xmm1, %xmm5 # [1:???] +; SSE-NEXT: mulps %xmm5, %xmm2 # [1:???] +; SSE-NEXT: subps %xmm2, %xmm0 # [1:???] +; SSE-NEXT: mulps %xmm5, %xmm0 # [1:???] +; SSE-NEXT: addps %xmm5, %xmm0 # [1:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 # [4:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 # [4:???] +; SSE-NEXT: movaps %xmm3, %xmm1 # [1:???] 
; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_two_step2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 # [1:???] +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 # [1:???] +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %ymm3 # [4:???] +; AVX-RECIP-NEXT: # ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 # [1:???] +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 # [1:???] +; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 # [1:???] +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:???] 
; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_two_step2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3 -; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 -; FMA-RECIP-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 # [1:???] +; FMA-RECIP-NEXT: vmovaps {{.*}}(%rip), %ymm2 # [4:???] +; FMA-RECIP-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # [1:???] +; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # [1:???] +; FMA-RECIP-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # [1:???] +; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:???] 
; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_two_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm3 # [5:1.00] +; BTVER2-NEXT: # ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # [2:1.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # [2:1.00] +; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # [3:1.00] +; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # [2:1.00] +; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # [3:1.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [2:1.00] +; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # [3:1.00] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [2:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:1.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [7:1.00] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_two_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps 
{{.*}}(%rip), %ymm0, %ymm0 +; SANDY-NEXT: vrcpps %ymm0, %ymm1 # [5:1.00] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # [5:1.00] +; SANDY-NEXT: vmovaps {{.*}}(%rip), %ymm3 # [4:0.50] +; SANDY-NEXT: # ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # [3:1.00] +; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # [5:1.00] +; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # [3:1.00] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [5:1.00] +; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # [3:1.00] +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:1.00] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_two_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NEXT: vmovaps %ymm1, %ymm3 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 -; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # [7:2.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00] +; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # [1:1.00] +; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # [1:0.00] +; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # [1:0.00] +; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # [1:0.00] +; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # [1:0.00] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_two_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 -; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; 
HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # [7:2.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:1.00] +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00] ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v8f32_two_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; KNL-NEXT: vmovaps %ymm1, %ymm3 -; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 -; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vrcpps %ymm0, %ymm1 # [7:2.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00] +; KNL-NEXT: vmovaps %ymm1, %ymm3 # [1:1.00] +; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # [1:0.00] +; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # [1:0.00] +; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # [1:0.00] +; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # [1:0.00] +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00] ; KNL-NEXT: retq ; ; SKX-LABEL: v8f32_two_step2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm1 -; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; 
SKX-NEXT: vmovaps %ymm1, %ymm3 -; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 -; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # [1:0.00] +; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm3 # [1:1.00] +; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # [1:0.00] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # [1:0.00] +; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # [1:0.00] +; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # [1:0.00] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00] ; SKX-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div @@ -1048,48 +1362,48 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 { ; SSE-LABEL: v8f32_no_step: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm0, %xmm0 -; SSE-NEXT: rcpps %xmm1, %xmm1 +; SSE-NEXT: rcpps %xmm0, %xmm0 # [1:???] +; SSE-NEXT: rcpps %xmm1, %xmm1 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_no_step: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 # [1:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_no_step: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 +; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 # [1:???] 
; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_no_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vrcpps %ymm0, %ymm0 +; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # [2:1.00] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_no_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm0 +; SANDY-NEXT: vrcpps %ymm0, %ymm0 # [5:1.00] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_no_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm0 +; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # [7:2.00] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_no_step: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # [7:2.00] ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v8f32_no_step: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm0 +; KNL-NEXT: vrcpps %ymm0, %ymm0 # [7:2.00] ; KNL-NEXT: retq ; ; SKX-LABEL: v8f32_no_step: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm0 +; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # [1:0.00] ; SKX-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div @@ -1098,58 +1412,58 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 { ; SSE-LABEL: v8f32_no_step2: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm1, %xmm1 -; SSE-NEXT: rcpps %xmm0, %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 +; SSE-NEXT: rcpps %xmm1, %xmm1 # [1:???] +; SSE-NEXT: rcpps %xmm0, %xmm0 # [1:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 # [4:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 # [4:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_no_step2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_no_step2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 # [1:???] 
+; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:???] ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_no_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vrcpps %ymm0, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # [2:1.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [7:1.00] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_no_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SANDY-NEXT: vrcpps %ymm0, %ymm0 # [5:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_no_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # [7:2.00] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_no_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # [7:2.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00] ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v8f32_no_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vrcpps %ymm0, %ymm0 # [7:2.00] +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00] ; KNL-NEXT: retq ; ; SKX-LABEL: v8f32_no_step2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # [1:0.00] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00] ; SKX-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div Index: test/CodeGen/X86/recip-pic.ll =================================================================== --- test/CodeGen/X86/recip-pic.ll +++ test/CodeGen/X86/recip-pic.ll @@ -1,27 +1,110 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -enable-unsafe-fp-math -mcpu=slm -relocation-model=pic | FileCheck %s --check-prefix=CHECK +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=slm -relocation-model=pic -print-schedule | FileCheck %s --check-prefix=CHECK -define fastcc float @foo(float %x) unnamed_addr #0 { -; CHECK-LABEL: foo: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: calll .L0$pb +define float @f32_no_estimate_2(float %x) #0 { +; CHECK-LABEL: f32_no_estimate_2: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax # [1:1.00] ; CHECK-NEXT: .Lcfi0: +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: calll .L0$pb # [1:0.00] +; CHECK-NEXT: .Lcfi1: ; CHECK-NEXT: .cfi_adjust_cfa_offset 4 ; CHECK-NEXT: .L0$pb: ; CHECK-NEXT: popl %eax -; CHECK-NEXT: .Lcfi1: +; CHECK-NEXT: .Lcfi2: ; CHECK-NEXT: .cfi_adjust_cfa_offset -4 -; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: .Ltmp0: # [1:0.50] ; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: divss %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 -; CHECK-NEXT: movss %xmm1, (%eax) +; CHECK-NEXT: movss {{\.LCPI.*}}@GOTOFF(%eax), %xmm0 # [3:1.00] +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: divss {{[0-9]+}}(%esp), %xmm0 # [37:34.00] +; CHECK-NEXT: movss %xmm0, (%eax) # [1:1.00] +; CHECK-NEXT: movss %xmm0, (%esp) # [1:1.00] +; CHECK-NEXT: flds (%esp) # [3:1.00] +; CHECK-NEXT: popl %eax # [3:1.00] ; CHECK-NEXT: retl -entry: %div = fdiv fast float 3.0, %x store float %div, float* undef, align 4 ret float %div } +define float @f32_one_step(float %x) #1 { +; CHECK-LABEL: f32_one_step: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax # [1:1.00] +; CHECK-NEXT: .Lcfi3: +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # [3:1.00] +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: calll .L1$pb # [1:0.00] +; CHECK-NEXT: .Lcfi4: +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; 
CHECK-NEXT: .L1$pb: +; CHECK-NEXT: popl %eax +; CHECK-NEXT: .Lcfi5: +; CHECK-NEXT: .cfi_adjust_cfa_offset -4 +; CHECK-NEXT: .Ltmp1: # [1:0.50] +; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.L1$pb), %eax +; CHECK-NEXT: movss {{\.LCPI.*}}@GOTOFF(%eax), %xmm2 # [3:1.00] +; CHECK-NEXT: # xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: rcpss %xmm0, %xmm1 # [5:1.00] +; CHECK-NEXT: mulss %xmm1, %xmm0 # [5:2.00] +; CHECK-NEXT: subss %xmm0, %xmm2 # [3:1.00] +; CHECK-NEXT: mulss %xmm1, %xmm2 # [5:2.00] +; CHECK-NEXT: addss %xmm1, %xmm2 # [3:1.00] +; CHECK-NEXT: mulss {{\.LCPI.*}}@GOTOFF(%eax), %xmm2 # [8:2.00] +; CHECK-NEXT: movss %xmm2, (%eax) # [1:1.00] +; CHECK-NEXT: movss %xmm2, (%esp) # [1:1.00] +; CHECK-NEXT: flds (%esp) # [3:1.00] +; CHECK-NEXT: popl %eax # [3:1.00] +; CHECK-NEXT: retl + %div = fdiv fast float 3.0, %x + store float %div, float* undef, align 4 + ret float %div +} + +define float @f32_two_steps(float %x) #2 { +; CHECK-LABEL: f32_two_steps: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax # [1:1.00] +; CHECK-NEXT: .Lcfi6: +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # [3:1.00] +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: calll .L2$pb # [1:0.00] +; CHECK-NEXT: .Lcfi7: +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: .L2$pb: +; CHECK-NEXT: popl %eax +; CHECK-NEXT: .Lcfi8: +; CHECK-NEXT: .cfi_adjust_cfa_offset -4 +; CHECK-NEXT: .Ltmp2: # [1:0.50] +; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.L2$pb), %eax +; CHECK-NEXT: movss {{\.LCPI.*}}@GOTOFF(%eax), %xmm3 # [3:1.00] +; CHECK-NEXT: # xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: rcpss %xmm0, %xmm1 # [5:1.00] +; CHECK-NEXT: movaps %xmm0, %xmm2 # [1:1.00] +; CHECK-NEXT: movaps %xmm3, %xmm4 # [1:1.00] +; CHECK-NEXT: mulss %xmm1, %xmm2 # [5:2.00] +; CHECK-NEXT: subss %xmm2, %xmm4 # [3:1.00] +; CHECK-NEXT: mulss %xmm1, %xmm4 # [5:2.00] +; CHECK-NEXT: addss %xmm1, %xmm4 # [3:1.00] +; CHECK-NEXT: mulss %xmm4, %xmm0 # [5:2.00] +; CHECK-NEXT: subss %xmm0, 
%xmm3 # [3:1.00] +; CHECK-NEXT: mulss %xmm4, %xmm3 # [5:2.00] +; CHECK-NEXT: addss %xmm4, %xmm3 # [3:1.00] +; CHECK-NEXT: mulss {{\.LCPI.*}}@GOTOFF(%eax), %xmm3 # [8:2.00] +; CHECK-NEXT: movss %xmm3, (%eax) # [1:1.00] +; CHECK-NEXT: movss %xmm3, (%esp) # [1:1.00] +; CHECK-NEXT: flds (%esp) # [3:1.00] +; CHECK-NEXT: popl %eax # [3:1.00] +; CHECK-NEXT: retl + %div = fdiv fast float 3.0, %x + store float %div, float* undef, align 4 + ret float %div +} +attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" } +attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" } +attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" } Index: tools/llc/llc.cpp =================================================================== --- tools/llc/llc.cpp +++ tools/llc/llc.cpp @@ -99,6 +99,10 @@ static cl::opt ShowMCEncoding("show-mc-encoding", cl::Hidden, cl::desc("Show encoding in .s output")); +static cl::opt + PrintSchedule("print-schedule", cl::Hidden, cl::init(false), + cl::desc("Print [latency:recip throughput] of instructions in .s output")); + static cl::opt EnableDwarfDirectory( "enable-dwarf-directory", cl::Hidden, cl::desc("Use .file directives with an explicit directory.")); @@ -446,6 +450,7 @@ TargetOptions Options = InitTargetOptionsFromCodeGenFlags(); Options.DisableIntegratedAS = NoIntegratedAssembler; Options.MCOptions.ShowMCEncoding = ShowMCEncoding; + Options.MCOptions.PrintSchedule = PrintSchedule; Options.MCOptions.MCUseDwarfDirectory = EnableDwarfDirectory; Options.MCOptions.AsmVerbose = AsmVerbose; Options.MCOptions.PreserveAsmComments = PreserveComments;