Index: include/llvm/MC/MCTargetOptions.h =================================================================== --- include/llvm/MC/MCTargetOptions.h +++ include/llvm/MC/MCTargetOptions.h @@ -47,6 +47,7 @@ bool ShowMCEncoding : 1; bool ShowMCInst : 1; bool AsmVerbose : 1; + bool PrintLatency : 1; /// Preserve Comments in Assembly. bool PreserveAsmComments : 1; @@ -82,6 +83,7 @@ ARE_EQUAL(ShowMCEncoding) && ARE_EQUAL(ShowMCInst) && ARE_EQUAL(AsmVerbose) && + ARE_EQUAL(PrintLatency) && ARE_EQUAL(DwarfVersion) && ARE_EQUAL(ABIName) && ARE_EQUAL(IASSearchPaths)); Index: lib/MC/MCTargetOptions.cpp =================================================================== --- lib/MC/MCTargetOptions.cpp +++ lib/MC/MCTargetOptions.cpp @@ -18,6 +18,7 @@ MCSaveTempLabels(false), MCUseDwarfDirectory(false), MCIncrementalLinkerCompatible(false), MCPIECopyRelocations(false), ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false), + PrintLatency(false), PreserveAsmComments(true) {} StringRef MCTargetOptions::getABIName() const { Index: lib/Target/X86/InstPrinter/X86InstComments.h =================================================================== --- lib/Target/X86/InstPrinter/X86InstComments.h +++ lib/Target/X86/InstPrinter/X86InstComments.h @@ -18,7 +18,8 @@ namespace llvm { enum AsmComments { - AC_EVEX_2_VEX = 0x2 // For instr that was compressed from EVEX to VEX. + AC_EVEX_2_VEX = 0x2, // For instr that was compressed from EVEX to VEX. + PRINT_LATENCY = 0x4 }; class MCInst; Index: lib/Target/X86/X86MCInstLower.cpp =================================================================== --- lib/Target/X86/X86MCInstLower.cpp +++ lib/Target/X86/X86MCInstLower.cpp @@ -1255,6 +1255,70 @@ return Comment; } +/// \brief Gets latency information for \p Inst from the itinerary +/// scheduling model. +/// \return The maximum expected latency over all the operands or -1 +/// if no information is available. 
+static int getItineraryLatency(const MachineInstr *Inst) { + const int NoInformationAvailable = -1; + + auto *MF = Inst->getParent()->getParent(); + auto &STI = MF->getSubtarget(); + // Check if we have a CPU to get the itinerary information. + if (STI.getCPU().empty()) + return NoInformationAvailable; + + // Get itinerary information. + InstrItineraryData IID = STI.getInstrItineraryForCPU(STI.getCPU()); + // Get the scheduling class of the requested instruction. + const MCInstrDesc& Desc = STI.getInstrInfo()->get(Inst->getOpcode()); + unsigned SCClass = Desc.getSchedClass(); + + int Latency = 0; + for (unsigned OpIdx = 0, OpIdxEnd = Inst->getNumOperands(); OpIdx != OpIdxEnd; + ++OpIdx) + Latency = std::max(Latency, IID.getOperandCycle(SCClass, OpIdx)); + + return Latency; +} + +/// \brief Gets latency information for \p Inst +/// \return The maximum expected latency over all the definitions or -1 +/// if no information is available. +static int getLatency(const MachineInstr *Inst) { + // Try to compute scheduling information. + auto &STI = Inst->getParent()->getParent()->getSubtarget(); + auto &SCModel = STI.getSchedModel(); + const int NoInformationAvailable = -1; + + // Check if we have a scheduling model for instructions. + if (!SCModel.hasInstrSchedModel()) + // Try to fall back to the itinerary model if the scheduling model doesn't + // have a scheduling table. Note the default does not have a table. + return getItineraryLatency(Inst); + + // Get the scheduling class of the requested instruction. + const MCInstrDesc& Desc = STI.getInstrInfo()->get(Inst->getOpcode()); + unsigned SCClass = Desc.getSchedClass(); + const MCSchedClassDesc *SCDesc = SCModel.getSchedClassDesc(SCClass); + // Resolving the variant SchedClass requires an MI to pass to + // SubTargetInfo::resolveSchedClass. + if (!SCDesc || !SCDesc->isValid() || SCDesc->isVariant()) + return NoInformationAvailable; + + // Compute output latency. 
+ int Latency = 0; + for (unsigned DefIdx = 0, DefEnd = SCDesc->NumWriteLatencyEntries; + DefIdx != DefEnd; ++DefIdx) { + // Lookup the definition's write latency in SubtargetInfo. + const MCWriteLatencyEntry *WLEntry = STI.getWriteLatencyEntry(SCDesc, + DefIdx); + Latency = std::max(Latency, WLEntry->Cycles); + } + + return Latency; +} + void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86MCInstLower MCInstLowering(*MF, *this); const X86RegisterInfo *RI = MF->getSubtarget().getRegisterInfo(); @@ -1266,6 +1330,15 @@ OutStreamer->AddComment("EVEX TO VEX Compression ", false); } + if (TM.Options.MCOptions.PrintLatency || + MI->getAsmPrinterFlags() & PRINT_LATENCY) { + if (MI->isPseudo() || MI->isTerminator()) + return; + auto Latency = getLatency(MI); + if (Latency > 0) + OutStreamer->AddComment(" Latency = " + Twine(Latency)); + } + switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: llvm_unreachable("Should be handled target independently"); Index: test/CodeGen/X86/recip-fastmath2.ll =================================================================== --- test/CodeGen/X86/recip-fastmath2.ll +++ test/CodeGen/X86/recip-fastmath2.ll @@ -1,64 +1,106 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY -; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mcpu=sandybridge| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mcpu=skx | FileCheck %s --check-prefix=CHECK 
--check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX ; It's the extra tests coverage for recip as discussed on D26855. +define float @f32_no_estimate_2(float %x) #0 { +; SSE-LABEL: f32_no_estimate_2: +; SSE: # BB#0: +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: divss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; +; AVX-RECIP-LABEL: f32_no_estimate_2: +; AVX-RECIP: # BB#0: +; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; +; FMA-RECIP-LABEL: f32_no_estimate_2: +; FMA-RECIP: # BB#0: +; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; FMA-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; +; BTVER2-LABEL: f32_no_estimate_2: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovss {{.*}}(%rip), %xmm1 # Latency = 5 +; BTVER2-NEXT: # xmm1 = mem[0],zero,zero,zero +; BTVER2-NEXT: vdivss %xmm0, %xmm1, %xmm0 # Latency = 19 +; +; SANDY-LABEL: f32_no_estimate_2: +; SANDY: # BB#0: +; SANDY-NEXT: vmovss {{.*}}(%rip), %xmm1 # Latency = 4 +; SANDY-NEXT: # xmm1 = mem[0],zero,zero,zero +; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # Latency = 12 +; +; HASWELL-LABEL: f32_no_estimate_2: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovss {{.*}}(%rip), %xmm1 # Latency = 4 +; HASWELL-NEXT: # xmm1 = mem[0],zero,zero,zero +; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # Latency = 12 +; +; HASWELL-NO-FMA-LABEL: f32_no_estimate_2: +; HASWELL-NO-FMA: # BB#0: +; HASWELL-NO-FMA-NEXT: vmovss {{.*}}(%rip), %xmm1 # Latency = 4 +; HASWELL-NO-FMA-NEXT: # xmm1 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 # Latency = 12 +; +; AVX512-LABEL: f32_no_estimate_2: +; AVX512: # BB#0: +; AVX512-NEXT: vmovss {{.*}}(%rip), %xmm1 # Latency = 4 +; AVX512-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vdivss %xmm0, %xmm1, %xmm0 # Latency = 12 + %div = fdiv fast float 1234.0, %x + ret float %div +} + define float @f32_no_step_2(float %x) #3 { ; SSE-LABEL: f32_no_step_2: ; SSE: # BB#0: ; SSE-NEXT: rcpss 
%xmm0, %xmm0 ; SSE-NEXT: mulss {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_no_step_2: ; AVX-RECIP: # BB#0: ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_no_step_2: ; FMA-RECIP: # BB#0: ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0 ; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: f32_no_step_2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # Latency = 2 +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # Latency = 7 ; ; SANDY-LABEL: f32_no_step_2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # Latency = 5 +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; HASWELL-LABEL: f32_no_step_2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # Latency = 5 +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; HASWELL-NO-FMA-LABEL: f32_no_step_2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; AVX512-LABEL: f32_no_step_2: ; AVX512: # BB#0: ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 %div = fdiv fast float 1234.0, %x ret float %div } @@ -74,7 +116,6 @@ ; SSE-NEXT: addss %xmm2, %xmm1 ; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 ; 
SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_one_step_2: ; AVX-RECIP: # BB#0: @@ -85,7 +126,6 @@ ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_one_step_2: ; FMA-RECIP: # BB#0: @@ -93,60 +133,124 @@ ; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 ; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 ; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: f32_one_step_2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovss {{.*}}(%rip), %xmm2 # Latency = 5 +; BTVER2-NEXT: # xmm2 = mem[0],zero,zero,zero +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # Latency = 2 +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # Latency = 2 +; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # Latency = 3 +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 2 +; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # Latency = 3 +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # Latency = 7 ; ; SANDY-LABEL: f32_one_step_2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # Latency = 5 +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # Latency = 5 +; SANDY-NEXT: vmovss {{.*}}(%rip), %xmm2 # Latency = 4 +; SANDY-NEXT: # xmm2 = 
mem[0],zero,zero,zero +; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # Latency = 3 +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 5 +; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # Latency = 3 +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; HASWELL-LABEL: f32_one_step_2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # Latency = 5 ; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 ; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; HASWELL-NO-FMA-LABEL: f32_one_step_2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vmovss {{.*}}(%rip), %xmm2 # Latency = 4 +; HASWELL-NO-FMA-NEXT: # xmm2 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; AVX512-LABEL: f32_one_step_2: ; AVX512: # BB#0: ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 ; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 %div = fdiv fast 
float 3456.0, %x ret float %div } +define float @f32_no_estimate_2_divs(float %x) #0 { +; SSE-LABEL: f32_no_estimate_2_divs: +; SSE: # BB#0: +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: divss %xmm0, %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: mulss %xmm1, %xmm0 +; SSE-NEXT: mulss %xmm1, %xmm0 +; +; AVX-RECIP-LABEL: f32_no_estimate_2_divs: +; AVX-RECIP: # BB#0: +; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 +; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; +; FMA-RECIP-LABEL: f32_no_estimate_2_divs: +; FMA-RECIP: # BB#0: +; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; FMA-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 +; FMA-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; +; BTVER2-LABEL: f32_no_estimate_2_divs: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovss {{.*}}(%rip), %xmm1 # Latency = 5 +; BTVER2-NEXT: # xmm1 = mem[0],zero,zero,zero +; BTVER2-NEXT: vdivss %xmm0, %xmm1, %xmm0 # Latency = 19 +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # Latency = 7 +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 2 +; +; SANDY-LABEL: f32_no_estimate_2_divs: +; SANDY: # BB#0: +; SANDY-NEXT: vmovss {{.*}}(%rip), %xmm1 # Latency = 4 +; SANDY-NEXT: # xmm1 = mem[0],zero,zero,zero +; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # Latency = 12 +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # Latency = 9 +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 5 +; +; HASWELL-LABEL: f32_no_estimate_2_divs: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovss {{.*}}(%rip), %xmm1 # Latency = 4 +; HASWELL-NEXT: # xmm1 = mem[0],zero,zero,zero +; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # Latency = 12 +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # Latency = 9 +; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 5 +; +; HASWELL-NO-FMA-LABEL: f32_no_estimate_2_divs: +; 
HASWELL-NO-FMA: # BB#0: +; HASWELL-NO-FMA-NEXT: vmovss {{.*}}(%rip), %xmm1 # Latency = 4 +; HASWELL-NO-FMA-NEXT: # xmm1 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 # Latency = 12 +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # Latency = 9 +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 5 +; +; AVX512-LABEL: f32_no_estimate_2_divs: +; AVX512: # BB#0: +; AVX512-NEXT: vmovss {{.*}}(%rip), %xmm1 # Latency = 4 +; AVX512-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vdivss %xmm0, %xmm1, %xmm0 # Latency = 12 +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # Latency = 9 +; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 5 + %div = fdiv fast float 3456.0, %x + %div2 = fdiv fast float %div, %x + ret float %div2 +} + define float @f32_one_step_2_divs(float %x) #1 { ; SSE-LABEL: f32_one_step_2_divs: ; SSE: # BB#0: @@ -159,7 +263,6 @@ ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: mulss %xmm2, %xmm0 ; SSE-NEXT: mulss %xmm2, %xmm0 -; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_one_step_2_divs: ; AVX-RECIP: # BB#0: @@ -171,7 +274,6 @@ ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_one_step_2_divs: ; FMA-RECIP: # BB#0: @@ -180,61 +282,58 @@ ; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 ; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 ; FMA-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: f32_one_step_2_divs: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: retq +; 
BTVER2-NEXT: vmovss {{.*}}(%rip), %xmm2 # Latency = 5 +; BTVER2-NEXT: # xmm2 = mem[0],zero,zero,zero +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # Latency = 2 +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # Latency = 2 +; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # Latency = 3 +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 2 +; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # Latency = 3 +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # Latency = 7 +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 2 ; ; SANDY-LABEL: f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # Latency = 5 +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # Latency = 5 +; SANDY-NEXT: vmovss {{.*}}(%rip), %xmm2 # Latency = 4 +; SANDY-NEXT: # xmm2 = mem[0],zero,zero,zero +; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # Latency = 3 +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 5 +; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # Latency = 3 +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # Latency = 9 +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 5 ; ; HASWELL-LABEL: f32_one_step_2_divs: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # Latency = 5 ; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 ; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # Latency = 9 +; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 5 ; ; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs: ; 
HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vmovss {{.*}}(%rip), %xmm2 # Latency = 4 +; HASWELL-NO-FMA-NEXT: # xmm2 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # Latency = 9 +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 5 ; ; AVX512-LABEL: f32_one_step_2_divs: ; AVX512: # BB#0: ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 ; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # Latency = 9 +; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 5 %div = fdiv fast float 3456.0, %x %div2 = fdiv fast float %div, %x ret float %div2 @@ -257,7 +356,6 @@ ; SSE-NEXT: addss %xmm4, %xmm1 ; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_two_step_2: ; AVX-RECIP: # BB#0: @@ -272,7 +370,6 @@ ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: 
f32_two_step_2: ; FMA-RECIP: # BB#0: @@ -284,80 +381,129 @@ ; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 ; FMA-RECIP-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 ; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: f32_two_step_2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovss {{.*}}(%rip), %xmm3 # Latency = 5 +; BTVER2-NEXT: # xmm3 = mem[0],zero,zero,zero +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # Latency = 2 +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 # Latency = 2 +; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 # Latency = 3 +; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 # Latency = 2 +; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 # Latency = 3 +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # Latency = 2 +; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 # Latency = 3 +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 2 +; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # Latency = 3 +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # Latency = 7 ; ; SANDY-LABEL: f32_two_step_2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss 
{{.*}}(%rip), %xmm0, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # Latency = 5 +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # Latency = 5 +; SANDY-NEXT: vmovss {{.*}}(%rip), %xmm3 # Latency = 4 +; SANDY-NEXT: # xmm3 = mem[0],zero,zero,zero +; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # Latency = 3 +; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # Latency = 5 +; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # Latency = 3 +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # Latency = 5 +; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # Latency = 3 +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 5 +; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # Latency = 3 +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; HASWELL-LABEL: f32_two_step_2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # Latency = 5 +; HASWELL-NEXT: vmovss {{.*}}(%rip), %xmm2 # Latency = 4 +; HASWELL-NEXT: # xmm2 = mem[0],zero,zero,zero +; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # Latency = 1 ; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 ; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 ; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 ; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; HASWELL-NO-FMA-LABEL: f32_two_step_2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, 
%xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vmovss {{.*}}(%rip), %xmm3 # Latency = 4 +; HASWELL-NO-FMA-NEXT: # xmm3 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; AVX512-LABEL: f32_two_step_2: ; AVX512: # BB#0: ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovaps %xmm1, %xmm3 +; AVX512-NEXT: vmovss {{.*}}(%rip), %xmm2 # Latency = 4 +; AVX512-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovaps %xmm1, %xmm3 # Latency = 1 ; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 ; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 ; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 ; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 %div = fdiv fast float 6789.0, %x ret float %div } +define <4 x float> @v4f32_no_estimate2(<4 x float> %x) #0 { +; SSE-LABEL: v4f32_no_estimate2: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SSE-NEXT: divps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; +; AVX-RECIP-LABEL: v4f32_no_estimate2: +; AVX-RECIP: # BB#0: +; 
AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 +; +; FMA-RECIP-LABEL: v4f32_no_estimate2: +; FMA-RECIP: # BB#0: +; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 +; +; BTVER2-LABEL: v4f32_no_estimate2: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm1 # Latency = 5 +; BTVER2-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 # Latency = 19 +; +; SANDY-LABEL: v4f32_no_estimate2: +; SANDY: # BB#0: +; SANDY-NEXT: vmovaps {{.*}}(%rip), %xmm1 # Latency = 4 +; SANDY-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # Latency = 12 +; +; HASWELL-LABEL: v4f32_no_estimate2: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovaps {{.*}}(%rip), %xmm1 # Latency = 4 +; HASWELL-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # Latency = 12 +; +; HASWELL-NO-FMA-LABEL: v4f32_no_estimate2: +; HASWELL-NO-FMA: # BB#0: +; HASWELL-NO-FMA-NEXT: vmovaps {{.*}}(%rip), %xmm1 # Latency = 4 +; HASWELL-NO-FMA-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 # Latency = 12 +; +; AVX512-LABEL: v4f32_no_estimate2: +; AVX512: # BB#0: +; AVX512-NEXT: vmovaps {{.*}}(%rip), %xmm1 # Latency = 4 +; AVX512-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 # Latency = 12 + %div = fdiv fast <4 x float> , %x + ret <4 x float> %div +} + define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 { ; SSE-LABEL: v4f32_one_step2: ; SSE: # BB#0: @@ -369,7 +515,6 @@ ; SSE-NEXT: addps %xmm2, %xmm1 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: 
v4f32_one_step2: ; AVX-RECIP: # BB#0: @@ -380,7 +525,6 @@ ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_one_step2: ; FMA-RECIP: # BB#0: @@ -388,66 +532,61 @@ ; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0 ; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v4f32_one_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm2 # Latency = 5 +; BTVER2-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # Latency = 2 +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # Latency = 2 +; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # Latency = 3 +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # Latency = 2 +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # Latency = 3 +; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # Latency = 7 ; ; SANDY-LABEL: v4f32_one_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # Latency = 5 +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # Latency = 5 +; SANDY-NEXT: vmovaps {{.*}}(%rip), %xmm2 # Latency = 4 +; SANDY-NEXT: # xmm2 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # Latency = 3 +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # Latency = 5 +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # Latency = 3 +; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; HASWELL-LABEL: v4f32_one_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 +; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # Latency = 5 +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # Latency = 4 ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # Latency = 4 +; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; KNL-LABEL: v4f32_one_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 +; KNL-NEXT: vrcpps %xmm0, %xmm1 # Latency = 5 +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # Latency = 4 ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; KNL-NEXT: 
vfmadd132ps %xmm1, %xmm1, %xmm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; KNL-NEXT: retq +; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; SKX-LABEL: v4f32_one_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 %div = fdiv fast <4 x float> , %x ret <4 x float> %div } @@ -464,7 +603,6 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] ; SSE-NEXT: mulps %xmm2, %xmm0 ; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v4f32_one_step_2_divs: ; AVX-RECIP: # BB#0: @@ -476,7 +614,6 @@ ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_one_step_2_divs: ; FMA-RECIP: # BB#0: @@ -485,72 +622,67 @@ ; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 ; FMA-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v4f32_one_step_2_divs: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm2 # Latency = 5 +; BTVER2-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # Latency = 2 +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # Latency = 2 +; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # Latency = 3 
+; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # Latency = 2 +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # Latency = 3 +; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # Latency = 7 +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # Latency = 2 ; ; SANDY-LABEL: v4f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # Latency = 5 +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # Latency = 5 +; SANDY-NEXT: vmovaps {{.*}}(%rip), %xmm2 # Latency = 4 +; SANDY-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # Latency = 3 +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # Latency = 5 +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # Latency = 3 +; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # Latency = 9 +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # Latency = 5 ; ; HASWELL-LABEL: v4f32_one_step_2_divs: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 +; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # Latency = 5 +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # Latency = 4 ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # Latency = 9 +; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # Latency = 5 ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; 
HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # Latency = 4 +; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # Latency = 9 +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # Latency = 5 ; ; KNL-LABEL: v4f32_one_step_2_divs: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 +; KNL-NEXT: vrcpps %xmm0, %xmm1 # Latency = 5 +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # Latency = 4 ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; KNL-NEXT: retq +; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # Latency = 9 +; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # Latency = 5 ; ; SKX-LABEL: v4f32_one_step_2_divs: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # Latency = 9 +; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 # Latency = 5 %div = fdiv fast <4 x float> , %x %div2 = fdiv fast <4 x float> %div, %x ret <4 x float> %div2 @@ -573,7 +705,6 @@ ; SSE-NEXT: addps %xmm4, %xmm1 ; SSE-NEXT: mulps 
{{.*}}(%rip), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v4f32_two_step2: ; AVX-RECIP: # BB#0: @@ -588,7 +719,6 @@ ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_two_step2: ; FMA-RECIP: # BB#0: @@ -600,92 +730,140 @@ ; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 ; FMA-RECIP-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v4f32_two_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm3 # Latency = 5 +; BTVER2-NEXT: # xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # Latency = 2 +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 # Latency = 2 +; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 # Latency = 3 +; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 # Latency = 2 +; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 # Latency = 3 +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # Latency = 2 +; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 # Latency = 3 +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # Latency = 2 +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # Latency = 3 +; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # Latency = 7 ; ; SANDY-LABEL: v4f32_two_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, 
%xmm0, %xmm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # Latency = 5 +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # Latency = 5 +; SANDY-NEXT: vmovaps {{.*}}(%rip), %xmm3 # Latency = 4 +; SANDY-NEXT: # xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # Latency = 3 +; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # Latency = 5 +; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # Latency = 3 +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # Latency = 5 +; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # Latency = 3 +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # Latency = 5 +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # Latency = 3 +; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; HASWELL-LABEL: v4f32_two_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 +; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # Latency = 5 +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # Latency = 4 +; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # Latency = 1 ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 ; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; HASWELL-NO-FMA-LABEL: v4f32_two_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, 
%xmm0, %xmm2 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 -; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 # Latency = 4 +; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; KNL-LABEL: v4f32_two_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; KNL-NEXT: vmovaps %xmm1, %xmm3 +; KNL-NEXT: vrcpps %xmm0, %xmm1 # Latency = 5 +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # Latency = 4 +; KNL-NEXT: vmovaps %xmm1, %xmm3 # Latency = 1 ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 ; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; KNL-NEXT: retq +; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 ; ; SKX-LABEL: v4f32_two_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 -; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; SKX-NEXT: 
vmovaps %xmm1, %xmm3 +; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # Latency = 4 +; SKX-NEXT: vmovaps %xmm1, %xmm3 # Latency = 1 ; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 ; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 ; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # Latency = 9 %div = fdiv fast <4 x float> , %x ret <4 x float> %div } +define <8 x float> @v8f32_no_estimate2(<8 x float> %x) #0 { +; SSE-LABEL: v8f32_no_estimate2: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SSE-NEXT: divps %xmm0, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm3 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; SSE-NEXT: divps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm1 +; +; AVX-RECIP-LABEL: v8f32_no_estimate2: +; AVX-RECIP: # BB#0: +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; +; FMA-RECIP-LABEL: v8f32_no_estimate2: +; FMA-RECIP: # BB#0: +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; +; BTVER2-LABEL: v8f32_no_estimate2: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm1 # Latency = 5 +; BTVER2-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 # Latency = 19 +; +; SANDY-LABEL: v8f32_no_estimate2: +; SANDY: # BB#0: +; SANDY-NEXT: vmovaps {{.*}}(%rip), %ymm1 # Latency = 4 +; SANDY-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; SANDY-NEXT: 
vdivps %ymm0, %ymm1, %ymm0 # Latency = 12 +; +; HASWELL-LABEL: v8f32_no_estimate2: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovaps {{.*}}(%rip), %ymm1 # Latency = 4 +; HASWELL-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # Latency = 19 +; +; HASWELL-NO-FMA-LABEL: v8f32_no_estimate2: +; HASWELL-NO-FMA: # BB#0: +; HASWELL-NO-FMA-NEXT: vmovaps {{.*}}(%rip), %ymm1 # Latency = 4 +; HASWELL-NO-FMA-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 # Latency = 19 +; +; AVX512-LABEL: v8f32_no_estimate2: +; AVX512: # BB#0: +; AVX512-NEXT: vmovaps {{.*}}(%rip), %ymm1 # Latency = 4 +; AVX512-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 # Latency = 19 + %div = fdiv fast <8 x float> , %x + ret <8 x float> %div +} + define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { ; SSE-LABEL: v8f32_one_step2: ; SSE: # BB#0: @@ -705,7 +883,6 @@ ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_one_step2: ; AVX-RECIP: # BB#0: @@ -716,7 +893,6 @@ ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_one_step2: ; FMA-RECIP: # BB#0: @@ -724,66 +900,61 @@ ; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0 ; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_one_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm2 # Latency = 5 +; BTVER2-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # Latency = 2 +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # Latency = 2 +; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # Latency = 3 +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # Latency = 2 +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # Latency = 3 +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 7 ; ; SANDY-LABEL: v8f32_one_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %ymm0, %ymm1 # Latency = 5 +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # Latency = 5 +; SANDY-NEXT: vmovaps {{.*}}(%rip), %ymm2 # Latency = 4 +; SANDY-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # Latency = 3 +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # Latency = 5 +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # Latency = 3 +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 9 ; ; HASWELL-LABEL: v8f32_one_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # Latency = 7 +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # Latency = 5 ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 9 ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # Latency = 7 +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 9 ; ; KNL-LABEL: v8f32_one_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 +; KNL-NEXT: vrcpps %ymm0, %ymm1 # Latency = 7 +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # Latency = 5 ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: retq +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 9 ; ; SKX-LABEL: v8f32_one_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 
-; SKX-NEXT: retq +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 9 %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -809,7 +980,6 @@ ; SSE-NEXT: mulps %xmm4, %xmm0 ; SSE-NEXT: mulps %xmm4, %xmm0 ; SSE-NEXT: mulps %xmm3, %xmm1 -; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_one_step_2_divs: ; AVX-RECIP: # BB#0: @@ -821,7 +991,6 @@ ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_one_step_2_divs: ; FMA-RECIP: # BB#0: @@ -830,72 +999,67 @@ ; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 ; FMA-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_one_step_2_divs: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm2 # Latency = 5 +; BTVER2-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # Latency = 2 +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # Latency = 2 +; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # Latency = 3 +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # Latency = 2 +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # Latency = 3 +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # Latency = 7 +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # Latency = 2 ; ; SANDY-LABEL: v8f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: 
vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %ymm0, %ymm1 # Latency = 5 +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # Latency = 5 +; SANDY-NEXT: vmovaps {{.*}}(%rip), %ymm2 # Latency = 4 +; SANDY-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # Latency = 3 +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # Latency = 5 +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # Latency = 3 +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # Latency = 9 +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # Latency = 5 ; ; HASWELL-LABEL: v8f32_one_step_2_divs: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # Latency = 7 +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # Latency = 5 ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # Latency = 9 +; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # Latency = 5 ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps 
{{.*}}(%rip), %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # Latency = 7 +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # Latency = 9 +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # Latency = 5 ; ; KNL-LABEL: v8f32_one_step_2_divs: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 +; KNL-NEXT: vrcpps %ymm0, %ymm1 # Latency = 7 +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # Latency = 5 ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; KNL-NEXT: retq +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # Latency = 9 +; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # Latency = 5 ; ; SKX-LABEL: v8f32_one_step_2_divs: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SKX-NEXT: retq +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # Latency = 9 +; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # Latency = 5 %div = fdiv fast <8 x float> , %x %div2 = fdiv fast <8 x float> %div, %x ret <8 x float> %div2 @@ -932,7 +1096,6 @@ ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_two_step2: ; AVX-RECIP: # BB#0: @@ -947,7 +1110,6 @@ ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, 
%ymm0 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_two_step2: ; FMA-RECIP: # BB#0: @@ -959,88 +1121,83 @@ ; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 ; FMA-RECIP-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_two_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm3 # Latency = 5 +; BTVER2-NEXT: # ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # Latency = 2 +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # Latency = 2 +; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # Latency = 3 +; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # Latency = 2 +; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # Latency = 3 +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # Latency = 2 +; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # Latency = 3 +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # Latency = 2 +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # Latency = 3 +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 7 ; ; SANDY-LABEL: v8f32_two_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %ymm0, %ymm1 # Latency = 5 +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # Latency = 5 +; SANDY-NEXT: vmovaps {{.*}}(%rip), %ymm3 # Latency = 4 +; SANDY-NEXT: # ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # Latency = 3 +; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # Latency = 5 +; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # Latency = 3 +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # Latency = 5 +; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # Latency = 3 +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # Latency = 5 +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # Latency = 3 +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 9 ; ; HASWELL-LABEL: v8f32_two_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NEXT: vmovaps %ymm1, %ymm3 +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # Latency = 7 +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # Latency = 5 +; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # Latency = 1 ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 ; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 9 ; ; HASWELL-NO-FMA-LABEL: v8f32_two_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps 
%ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 -; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # Latency = 7 +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # Latency = 5 +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # Latency = 3 +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 9 ; ; KNL-LABEL: v8f32_two_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; KNL-NEXT: vmovaps %ymm1, %ymm3 +; KNL-NEXT: vrcpps %ymm0, %ymm1 # Latency = 7 +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # Latency = 5 +; KNL-NEXT: vmovaps %ymm1, %ymm3 # Latency = 1 ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 ; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: retq +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 9 ; ; SKX-LABEL: v8f32_two_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 -; SKX-NEXT: 
vbroadcastss {{.*}}(%rip), %ymm2 -; SKX-NEXT: vmovaps %ymm1, %ymm3 +; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # Latency = 5 +; SKX-NEXT: vmovaps %ymm1, %ymm3 # Latency = 1 ; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 ; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 ; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SKX-NEXT: retq +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 9 %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -1050,47 +1207,38 @@ ; SSE: # BB#0: ; SSE-NEXT: rcpps %xmm0, %xmm0 ; SSE-NEXT: rcpps %xmm1, %xmm1 -; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_no_step: ; AVX-RECIP: # BB#0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 -; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_no_step: ; FMA-RECIP: # BB#0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 -; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_no_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vrcpps %ymm0, %ymm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # Latency = 2 ; ; SANDY-LABEL: v8f32_no_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %ymm0, %ymm0 # Latency = 5 ; ; HASWELL-LABEL: v8f32_no_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # Latency = 7 ; ; HASWELL-NO-FMA-LABEL: v8f32_no_step: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # Latency = 7 ; ; KNL-LABEL: v8f32_no_step: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm0 -; KNL-NEXT: retq +; KNL-NEXT: vrcpps %ymm0, %ymm0 # Latency = 7 ; ; SKX-LABEL: v8f32_no_step: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm0 -; SKX-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -1102,55 +1250,46 @@ ; SSE-NEXT: rcpps %xmm0, %xmm0 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 -; 
SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_no_step2: ; AVX-RECIP: # BB#0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_no_step2: ; FMA-RECIP: # BB#0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_no_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vrcpps %ymm0, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # Latency = 2 +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 7 ; ; SANDY-LABEL: v8f32_no_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %ymm0, %ymm0 # Latency = 5 +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 9 ; ; HASWELL-LABEL: v8f32_no_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # Latency = 7 +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 9 ; ; HASWELL-NO-FMA-LABEL: v8f32_no_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # Latency = 7 +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 9 ; ; KNL-LABEL: v8f32_no_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: retq +; KNL-NEXT: vrcpps %ymm0, %ymm0 # Latency = 7 +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 9 ; ; SKX-LABEL: v8f32_no_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SKX-NEXT: retq +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # Latency = 9 %div = fdiv fast <8 x float> , %x 
ret <8 x float> %div } Index: test/CodeGen/X86/recip-pic.ll =================================================================== --- test/CodeGen/X86/recip-pic.ll +++ test/CodeGen/X86/recip-pic.ll @@ -1,27 +1,107 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -enable-unsafe-fp-math -mcpu=slm -relocation-model=pic | FileCheck %s --check-prefix=CHECK +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=slm -relocation-model=pic -print-latency | FileCheck %s --check-prefix=CHECK -define fastcc float @foo(float %x) unnamed_addr #0 { -; CHECK-LABEL: foo: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: calll .L0$pb +define float @f32_no_estimate_2(float %x) #0 { +; CHECK-LABEL: f32_no_estimate_2: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax # Latency = 1 ; CHECK-NEXT: .Lcfi0: +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: calll .L0$pb +; CHECK-NEXT: .Lcfi1: ; CHECK-NEXT: .cfi_adjust_cfa_offset 4 ; CHECK-NEXT: .L0$pb: ; CHECK-NEXT: popl %eax -; CHECK-NEXT: .Lcfi1: +; CHECK-NEXT: .Lcfi2: ; CHECK-NEXT: .cfi_adjust_cfa_offset -4 -; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: .Ltmp0: # Latency = 1 ; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: divss %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 -; CHECK-NEXT: movss %xmm1, (%eax) -; CHECK-NEXT: retl -entry: +; CHECK-NEXT: movss {{\.LCPI.*}}@GOTOFF(%eax), %xmm0 # Latency = 3 +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: divss {{[0-9]+}}(%esp), %xmm0 # Latency = 37 +; CHECK-NEXT: movss %xmm0, (%eax) # Latency = 1 +; CHECK-NEXT: movss %xmm0, (%esp) # Latency = 1 +; CHECK-NEXT: flds (%esp) # Latency = 3 +; CHECK-NEXT: popl %eax # Latency = 3 %div = fdiv fast float 3.0, %x store float %div, float* undef, align 4 ret float %div } +define float @f32_one_step(float %x) #1 { +; CHECK-LABEL: f32_one_step: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax # Latency 
= 1 +; CHECK-NEXT: .Lcfi3: +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # Latency = 3 +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: calll .L1$pb +; CHECK-NEXT: .Lcfi4: +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: .L1$pb: +; CHECK-NEXT: popl %eax +; CHECK-NEXT: .Lcfi5: +; CHECK-NEXT: .cfi_adjust_cfa_offset -4 +; CHECK-NEXT: .Ltmp1: # Latency = 1 +; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.L1$pb), %eax +; CHECK-NEXT: movss {{\.LCPI.*}}@GOTOFF(%eax), %xmm2 # Latency = 3 +; CHECK-NEXT: # xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: rcpss %xmm0, %xmm1 # Latency = 5 +; CHECK-NEXT: mulss %xmm1, %xmm0 # Latency = 5 +; CHECK-NEXT: subss %xmm0, %xmm2 # Latency = 3 +; CHECK-NEXT: mulss %xmm1, %xmm2 # Latency = 5 +; CHECK-NEXT: addss %xmm1, %xmm2 # Latency = 3 +; CHECK-NEXT: mulss {{\.LCPI.*}}@GOTOFF(%eax), %xmm2 # Latency = 8 +; CHECK-NEXT: movss %xmm2, (%eax) # Latency = 1 +; CHECK-NEXT: movss %xmm2, (%esp) # Latency = 1 +; CHECK-NEXT: flds (%esp) # Latency = 3 +; CHECK-NEXT: popl %eax # Latency = 3 + %div = fdiv fast float 3.0, %x + store float %div, float* undef, align 4 + ret float %div +} + +define float @f32_two_steps(float %x) #2 { +; CHECK-LABEL: f32_two_steps: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax # Latency = 1 +; CHECK-NEXT: .Lcfi6: +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # Latency = 3 +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: calll .L2$pb +; CHECK-NEXT: .Lcfi7: +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: .L2$pb: +; CHECK-NEXT: popl %eax +; CHECK-NEXT: .Lcfi8: +; CHECK-NEXT: .cfi_adjust_cfa_offset -4 +; CHECK-NEXT: .Ltmp2: # Latency = 1 +; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.L2$pb), %eax +; CHECK-NEXT: movss {{\.LCPI.*}}@GOTOFF(%eax), %xmm3 # Latency = 3 +; CHECK-NEXT: # xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: rcpss %xmm0, %xmm1 # Latency = 5 +; CHECK-NEXT: movaps %xmm0, %xmm2 # Latency = 1 +; CHECK-NEXT: 
movaps %xmm3, %xmm4 # Latency = 1 +; CHECK-NEXT: mulss %xmm1, %xmm2 # Latency = 5 +; CHECK-NEXT: subss %xmm2, %xmm4 # Latency = 3 +; CHECK-NEXT: mulss %xmm1, %xmm4 # Latency = 5 +; CHECK-NEXT: addss %xmm1, %xmm4 # Latency = 3 +; CHECK-NEXT: mulss %xmm4, %xmm0 # Latency = 5 +; CHECK-NEXT: subss %xmm0, %xmm3 # Latency = 3 +; CHECK-NEXT: mulss %xmm4, %xmm3 # Latency = 5 +; CHECK-NEXT: addss %xmm4, %xmm3 # Latency = 3 +; CHECK-NEXT: mulss {{\.LCPI.*}}@GOTOFF(%eax), %xmm3 # Latency = 8 +; CHECK-NEXT: movss %xmm3, (%eax) # Latency = 1 +; CHECK-NEXT: movss %xmm3, (%esp) # Latency = 1 +; CHECK-NEXT: flds (%esp) # Latency = 3 +; CHECK-NEXT: popl %eax # Latency = 3 + %div = fdiv fast float 3.0, %x + store float %div, float* undef, align 4 + ret float %div +} +attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" } +attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" } +attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" } Index: tools/llc/llc.cpp =================================================================== --- tools/llc/llc.cpp +++ tools/llc/llc.cpp @@ -99,6 +99,10 @@ static cl::opt<bool> ShowMCEncoding("show-mc-encoding", cl::Hidden, cl::desc("Show encoding in .s output")); +static cl::opt<bool> PrintLatency("print-latency", cl::Hidden, + cl::init(false), + cl::desc("Print latency of instructions in .s output")); + static cl::opt<bool> EnableDwarfDirectory( "enable-dwarf-directory", cl::Hidden, cl::desc("Use .file directives with an explicit directory.")); @@ -446,6 +450,7 @@ TargetOptions Options = InitTargetOptionsFromCodeGenFlags(); Options.DisableIntegratedAS = NoIntegratedAssembler; Options.MCOptions.ShowMCEncoding = ShowMCEncoding; + Options.MCOptions.PrintLatency = PrintLatency; Options.MCOptions.MCUseDwarfDirectory = EnableDwarfDirectory; Options.MCOptions.AsmVerbose = AsmVerbose; Options.MCOptions.PreserveAsmComments = PreserveComments;