Index: include/llvm/CodeGen/AsmPrinter.h =================================================================== --- include/llvm/CodeGen/AsmPrinter.h +++ include/llvm/CodeGen/AsmPrinter.h @@ -22,10 +22,12 @@ #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/DwarfStringPoolEntry.h" +#include "llvm/CodeGen/TargetSchedule.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include #include #include @@ -112,6 +114,9 @@ typedef std::pair GOTEquivUsePair; MapVector GlobalGOTEquivs; + /// The target schedule model. + TargetSchedModel SchedModel; + private: MCSymbol *CurrentFnBegin = nullptr; MCSymbol *CurrentFnEnd = nullptr; Index: include/llvm/CodeGen/MachineInstr.h =================================================================== --- include/llvm/CodeGen/MachineInstr.h +++ include/llvm/CodeGen/MachineInstr.h @@ -60,7 +60,8 @@ /// otherwise easily derivable from the IR text. /// enum CommentFlag { - ReloadReuse = 0x1 // higher bits are reserved for target dep comments. + ReloadReuse = 0x1, // higher bits are reserved for target dep comments. + PrintLatency = 0x2 }; enum MIFlag { Index: include/llvm/MC/MCTargetOptions.h =================================================================== --- include/llvm/MC/MCTargetOptions.h +++ include/llvm/MC/MCTargetOptions.h @@ -47,6 +47,7 @@ bool ShowMCEncoding : 1; bool ShowMCInst : 1; bool AsmVerbose : 1; + bool PrintLatency : 1; /// Preserve Comments in Assembly. 
bool PreserveAsmComments : 1; @@ -82,6 +83,7 @@ ARE_EQUAL(ShowMCEncoding) && ARE_EQUAL(ShowMCInst) && ARE_EQUAL(AsmVerbose) && + ARE_EQUAL(PrintLatency) && ARE_EQUAL(DwarfVersion) && ARE_EQUAL(ABIName) && ARE_EQUAL(IASSearchPaths)); Index: lib/CodeGen/AsmPrinter/AsmPrinter.cpp =================================================================== --- lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -46,6 +46,7 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/TargetSchedule.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" @@ -719,7 +720,8 @@ } /// emitComments - Pretty-print comments for instructions. -static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { +static void emitComments(AsmPrinter *AP, const MachineInstr &MI, + raw_ostream &CommentOS) { const MachineFunction *MF = MI.getParent()->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); @@ -752,6 +754,15 @@ // Check for spill-induced copies if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse)) CommentOS << " Reload Reuse\n"; + + if (AP->TM.Options.MCOptions.PrintLatency || + MI.getAsmPrinterFlag(MachineInstr::PrintLatency)) { + if (!(MI.isPseudo() || MI.isTerminator())) { + auto Latency = AP->SchedModel.computeInstrLatency(&MI); + if (Latency > 0) + CommentOS << " [" << Latency << ":???]\n"; + } + } } /// emitImplicitDef - This method emits the specified machine instruction @@ -965,7 +976,7 @@ } if (isVerbose()) - emitComments(MI, OutStreamer->GetCommentOS()); + emitComments(this, MI, OutStreamer->GetCommentOS()); switch (MI.getOpcode()) { case TargetOpcode::CFI_INSTRUCTION: @@ -1380,8 +1391,11 @@ } ORE = &getAnalysis().getORE(); - if (isVerbose()) + if (isVerbose()) { LI = &getAnalysis(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + 
SchedModel.init(STI.getSchedModel(), &STI, STI.getInstrInfo()); + } } namespace { Index: lib/MC/MCTargetOptions.cpp =================================================================== --- lib/MC/MCTargetOptions.cpp +++ lib/MC/MCTargetOptions.cpp @@ -18,6 +18,7 @@ MCSaveTempLabels(false), MCUseDwarfDirectory(false), MCIncrementalLinkerCompatible(false), MCPIECopyRelocations(false), ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false), + PrintLatency(false), PreserveAsmComments(true) {} StringRef MCTargetOptions::getABIName() const { Index: lib/Target/X86/InstPrinter/X86InstComments.h =================================================================== --- lib/Target/X86/InstPrinter/X86InstComments.h +++ lib/Target/X86/InstPrinter/X86InstComments.h @@ -18,7 +18,7 @@ namespace llvm { enum AsmComments { - AC_EVEX_2_VEX = 0x2 // For instr that was compressed from EVEX to VEX. + AC_EVEX_2_VEX = 0x4 // For instr that was compressed from EVEX to VEX. }; class MCInst; Index: test/CodeGen/X86/recip-fastmath2.ll =================================================================== --- test/CodeGen/X86/recip-fastmath2.ll +++ test/CodeGen/X86/recip-fastmath2.ll @@ -1,63 +1,124 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY -; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mcpu=sandybridge| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-latency -mcpu=skx | FileCheck %s --check-prefix=CHECK 
--check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX ; It's the extra tests coverage for recip as discussed on D26855. +define float @f32_no_estimate_2(float %x) #0 { +; SSE-LABEL: f32_no_estimate_2: +; SSE: # BB#0: +; SSE-NEXT: movss {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: divss %xmm0, %xmm1 # [10:???] +; SSE-NEXT: movaps %xmm1, %xmm0 # [1:???] +; SSE-NEXT: retq +; +; AVX-RECIP-LABEL: f32_no_estimate_2: +; AVX-RECIP: # BB#0: +; AVX-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; AVX-RECIP-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [10:???] +; AVX-RECIP-NEXT: retq +; +; FMA-RECIP-LABEL: f32_no_estimate_2: +; FMA-RECIP: # BB#0: +; FMA-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; FMA-RECIP-NEXT: # xmm1 = mem[0],zero,zero,zero +; FMA-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [10:???] +; FMA-RECIP-NEXT: retq +; +; BTVER2-LABEL: f32_no_estimate_2: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovss {{.*}}(%rip), %xmm1 # [5:???] +; BTVER2-NEXT: # xmm1 = mem[0],zero,zero,zero +; BTVER2-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [19:???] +; BTVER2-NEXT: retq +; +; SANDY-LABEL: f32_no_estimate_2: +; SANDY: # BB#0: +; SANDY-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; SANDY-NEXT: # xmm1 = mem[0],zero,zero,zero +; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:???] +; SANDY-NEXT: retq +; +; HASWELL-LABEL: f32_no_estimate_2: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; HASWELL-NEXT: # xmm1 = mem[0],zero,zero,zero +; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:???] +; HASWELL-NEXT: retq +; +; HASWELL-NO-FMA-LABEL: f32_no_estimate_2: +; HASWELL-NO-FMA: # BB#0: +; HASWELL-NO-FMA-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; HASWELL-NO-FMA-NEXT: # xmm1 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:???] 
+; HASWELL-NO-FMA-NEXT: retq +; +; AVX512-LABEL: f32_no_estimate_2: +; AVX512: # BB#0: +; AVX512-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; AVX512-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:???] +; AVX512-NEXT: retq + %div = fdiv fast float 1234.0, %x + ret float %div +} + define float @f32_no_step_2(float %x) #3 { ; SSE-LABEL: f32_no_step_2: ; SSE: # BB#0: -; SSE-NEXT: rcpss %xmm0, %xmm0 -; SSE-NEXT: mulss {{.*}}(%rip), %xmm0 +; SSE-NEXT: rcpss %xmm0, %xmm0 # [1:???] +; SSE-NEXT: mulss {{.*}}(%rip), %xmm0 # [4:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_no_step_2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_no_step_2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:???] ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: f32_no_step_2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # [2:???] +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [7:???] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_no_step_2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # [5:???] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_no_step_2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # [5:???] +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:???] 
; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_no_step_2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_no_step_2: ; AVX512: # BB#0: -; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0 # [1:???] +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; AVX512-NEXT: retq %div = fdiv fast float 1234.0, %x ret float %div @@ -66,174 +127,264 @@ define float @f32_one_step_2(float %x) #1 { ; SSE-LABEL: f32_one_step_2: ; SSE: # BB#0: -; SSE-NEXT: rcpss %xmm0, %xmm2 -; SSE-NEXT: mulss %xmm2, %xmm0 -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: subss %xmm0, %xmm1 -; SSE-NEXT: mulss %xmm2, %xmm1 -; SSE-NEXT: addss %xmm2, %xmm1 -; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: rcpss %xmm0, %xmm2 # [1:???] +; SSE-NEXT: mulss %xmm2, %xmm0 # [1:???] +; SSE-NEXT: movss {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: subss %xmm0, %xmm1 # [1:???] +; SSE-NEXT: mulss %xmm2, %xmm1 # [1:???] +; SSE-NEXT: addss %xmm2, %xmm1 # [1:???] +; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: movaps %xmm1, %xmm0 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_one_step_2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [1:???] 
+; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:???] +; AVX-RECIP-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_one_step_2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:???] +; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:???] ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: f32_one_step_2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vmovss {{.*}}(%rip), %xmm2 # [5:???] +; BTVER2-NEXT: # xmm2 = mem[0],zero,zero,zero +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [2:???] +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [2:???] +; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # [3:???] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [2:???] +; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:???] +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [7:???] 
; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_one_step_2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:???] +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [5:???] +; SANDY-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:???] +; SANDY-NEXT: # xmm2 = mem[0],zero,zero,zero +; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # [3:???] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:???] +; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:???] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_one_step_2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:???] +; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:???] +; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:???] +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_one_step_2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:???] +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:???] 
+; HASWELL-NO-FMA-NEXT: # xmm2 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_one_step_2: ; AVX512: # BB#0: -; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # [1:???] +; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:???] +; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:???] +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; AVX512-NEXT: retq %div = fdiv fast float 3456.0, %x ret float %div } +define float @f32_no_estimate_2_divs(float %x) #0 { +; SSE-LABEL: f32_no_estimate_2_divs: +; SSE: # BB#0: +; SSE-NEXT: movss {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: divss %xmm0, %xmm1 # [10:???] +; SSE-NEXT: movss {{.*}}(%rip), %xmm0 # [4:???] +; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: mulss %xmm1, %xmm0 # [1:???] +; SSE-NEXT: mulss %xmm1, %xmm0 # [1:???] +; SSE-NEXT: retq +; +; AVX-RECIP-LABEL: f32_no_estimate_2_divs: +; AVX-RECIP: # BB#0: +; AVX-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; AVX-RECIP-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [10:???] +; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [4:???] +; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: retq +; +; FMA-RECIP-LABEL: f32_no_estimate_2_divs: +; FMA-RECIP: # BB#0: +; FMA-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; FMA-RECIP-NEXT: # xmm1 = mem[0],zero,zero,zero +; FMA-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [10:???] 
+; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [4:???] +; FMA-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [1:???] +; FMA-RECIP-NEXT: retq +; +; BTVER2-LABEL: f32_no_estimate_2_divs: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovss {{.*}}(%rip), %xmm1 # [5:???] +; BTVER2-NEXT: # xmm1 = mem[0],zero,zero,zero +; BTVER2-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [19:???] +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [7:???] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [2:???] +; BTVER2-NEXT: retq +; +; SANDY-LABEL: f32_no_estimate_2_divs: +; SANDY: # BB#0: +; SANDY-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; SANDY-NEXT: # xmm1 = mem[0],zero,zero,zero +; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:???] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:???] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:???] +; SANDY-NEXT: retq +; +; HASWELL-LABEL: f32_no_estimate_2_divs: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; HASWELL-NEXT: # xmm1 = mem[0],zero,zero,zero +; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:???] +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:???] +; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:???] +; HASWELL-NEXT: retq +; +; HASWELL-NO-FMA-LABEL: f32_no_estimate_2_divs: +; HASWELL-NO-FMA: # BB#0: +; HASWELL-NO-FMA-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; HASWELL-NO-FMA-NEXT: # xmm1 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:???] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:???] +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:???] +; HASWELL-NO-FMA-NEXT: retq +; +; AVX512-LABEL: f32_no_estimate_2_divs: +; AVX512: # BB#0: +; AVX512-NEXT: vmovss {{.*}}(%rip), %xmm1 # [4:???] +; AVX512-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vdivss %xmm0, %xmm1, %xmm0 # [12:???] +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:???] +; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:???] 
+; AVX512-NEXT: retq + %div = fdiv fast float 3456.0, %x + %div2 = fdiv fast float %div, %x + ret float %div2 +} + define float @f32_one_step_2_divs(float %x) #1 { ; SSE-LABEL: f32_one_step_2_divs: ; SSE: # BB#0: -; SSE-NEXT: rcpss %xmm0, %xmm1 -; SSE-NEXT: mulss %xmm1, %xmm0 -; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-NEXT: subss %xmm0, %xmm2 -; SSE-NEXT: mulss %xmm1, %xmm2 -; SSE-NEXT: addss %xmm1, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: mulss %xmm2, %xmm0 -; SSE-NEXT: mulss %xmm2, %xmm0 +; SSE-NEXT: rcpss %xmm0, %xmm1 # [1:???] +; SSE-NEXT: mulss %xmm1, %xmm0 # [1:???] +; SSE-NEXT: movss {{.*}}(%rip), %xmm2 # [4:???] +; SSE-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE-NEXT: subss %xmm0, %xmm2 # [1:???] +; SSE-NEXT: mulss %xmm1, %xmm2 # [1:???] +; SSE-NEXT: addss %xmm1, %xmm2 # [1:???] +; SSE-NEXT: movss {{.*}}(%rip), %xmm0 # [4:???] +; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: mulss %xmm2, %xmm0 # [1:???] +; SSE-NEXT: mulss %xmm2, %xmm0 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_one_step_2_divs: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [1:???] +; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:???] +; AVX-RECIP-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [4:???] 
+; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [1:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_one_step_2_divs: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:???] +; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [4:???] +; FMA-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [1:???] ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: f32_one_step_2_divs: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmovss {{.*}}(%rip), %xmm2 # [5:???] +; BTVER2-NEXT: # xmm2 = mem[0],zero,zero,zero +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [2:???] +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [2:???] +; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # [3:???] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [2:???] +; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:???] +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [7:???] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [2:???] 
; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:???] +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [5:???] +; SANDY-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:???] +; SANDY-NEXT: # xmm2 = mem[0],zero,zero,zero +; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # [3:???] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:???] +; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:???] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:???] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:???] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_one_step_2_divs: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:???] +; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:???] +; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:???] +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:???] +; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:???] 
; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:???] +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:???] +; HASWELL-NO-FMA-NEXT: # xmm2 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:???] +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:???] ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_one_step_2_divs: ; AVX512: # BB#0: -; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # [1:???] +; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:???] +; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:???] +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:???] +; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:???] 
; AVX512-NEXT: retq %div = fdiv fast float 3456.0, %x %div2 = fdiv fast float %div, %x @@ -243,210 +394,283 @@ define float @f32_two_step_2(float %x) #2 { ; SSE-LABEL: f32_two_step_2: ; SSE: # BB#0: -; SSE-NEXT: rcpss %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: mulss %xmm2, %xmm3 -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: subss %xmm3, %xmm4 -; SSE-NEXT: mulss %xmm2, %xmm4 -; SSE-NEXT: addss %xmm2, %xmm4 -; SSE-NEXT: mulss %xmm4, %xmm0 -; SSE-NEXT: subss %xmm0, %xmm1 -; SSE-NEXT: mulss %xmm4, %xmm1 -; SSE-NEXT: addss %xmm4, %xmm1 -; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: rcpss %xmm0, %xmm2 # [1:???] +; SSE-NEXT: movaps %xmm0, %xmm3 # [1:???] +; SSE-NEXT: mulss %xmm2, %xmm3 # [1:???] +; SSE-NEXT: movss {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movaps %xmm1, %xmm4 # [1:???] +; SSE-NEXT: subss %xmm3, %xmm4 # [1:???] +; SSE-NEXT: mulss %xmm2, %xmm4 # [1:???] +; SSE-NEXT: addss %xmm2, %xmm4 # [1:???] +; SSE-NEXT: mulss %xmm4, %xmm0 # [1:???] +; SSE-NEXT: subss %xmm0, %xmm1 # [1:???] +; SSE-NEXT: mulss %xmm4, %xmm1 # [1:???] +; SSE-NEXT: addss %xmm4, %xmm1 # [1:???] +; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: movaps %xmm1, %xmm0 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_two_step_2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [1:???] 
+; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2 # [1:???] +; AVX-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm3 # [4:???] +; AVX-RECIP-NEXT: # xmm3 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2 # [1:???] +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2 # [1:???] +; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1 # [1:???] +; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_two_step_2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 -; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 -; FMA-RECIP-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [1:???] +; FMA-RECIP-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:???] +; FMA-RECIP-NEXT: # xmm2 = mem[0],zero,zero,zero +; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # [1:???] +; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:???] 
; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: f32_two_step_2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vmovss {{.*}}(%rip), %xmm3 # [5:???] +; BTVER2-NEXT: # xmm3 = mem[0],zero,zero,zero +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [2:???] +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 # [2:???] +; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 # [3:???] +; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 # [2:???] +; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 # [3:???] +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [2:???] +; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 # [3:???] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [2:???] +; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:???] +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [7:???] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_two_step_2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:???] +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # [5:???] +; SANDY-NEXT: vmovss {{.*}}(%rip), %xmm3 # [4:???] 
+; SANDY-NEXT: # xmm3 = mem[0],zero,zero,zero +; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # [3:???] +; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # [5:???] +; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # [3:???] +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [5:???] +; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # [3:???] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:???] +; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:???] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_two_step_2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 -; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 -; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:???] +; HASWELL-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:???] +; HASWELL-NEXT: # xmm2 = mem[0],zero,zero,zero +; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # [1:???] +; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # [1:???] +; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # [1:???] +; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # [1:???] +; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # [1:???] +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:???] 
; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_two_step_2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # [5:???] +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 # [5:???] +; HASWELL-NO-FMA-NEXT: vmovss {{.*}}(%rip), %xmm3 # [4:???] +; HASWELL-NO-FMA-NEXT: # xmm3 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 # [5:???] +; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_two_step_2: ; AVX512: # BB#0: -; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovaps %xmm1, %xmm3 -; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 -; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 -; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # [1:???] +; AVX512-NEXT: vmovss {{.*}}(%rip), %xmm2 # [4:???] 
+; AVX512-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovaps %xmm1, %xmm3 # [1:???] +; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # [1:???] +; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # [1:???] +; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # [1:???] +; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # [1:???] +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; AVX512-NEXT: retq %div = fdiv fast float 6789.0, %x ret float %div } +define <4 x float> @v4f32_no_estimate2(<4 x float> %x) #0 { +; SSE-LABEL: v4f32_no_estimate2: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SSE-NEXT: divps %xmm0, %xmm1 # [10:???] +; SSE-NEXT: movaps %xmm1, %xmm0 # [1:???] +; SSE-NEXT: retq +; +; AVX-RECIP-LABEL: v4f32_no_estimate2: +; AVX-RECIP: # BB#0: +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %xmm1 # [4:???] +; AVX-RECIP-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 # [10:???] +; AVX-RECIP-NEXT: retq +; +; FMA-RECIP-LABEL: v4f32_no_estimate2: +; FMA-RECIP: # BB#0: +; FMA-RECIP-NEXT: vmovaps {{.*}}(%rip), %xmm1 # [4:???] +; FMA-RECIP-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 # [10:???] +; FMA-RECIP-NEXT: retq +; +; BTVER2-LABEL: v4f32_no_estimate2: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm1 # [5:???] +; BTVER2-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 # [19:???] +; BTVER2-NEXT: retq +; +; SANDY-LABEL: v4f32_no_estimate2: +; SANDY: # BB#0: +; SANDY-NEXT: vmovaps {{.*}}(%rip), %xmm1 # [4:???] +; SANDY-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # [12:???] 
+; SANDY-NEXT: retq +; +; HASWELL-LABEL: v4f32_no_estimate2: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovaps {{.*}}(%rip), %xmm1 # [4:???] +; HASWELL-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # [12:???] +; HASWELL-NEXT: retq +; +; HASWELL-NO-FMA-LABEL: v4f32_no_estimate2: +; HASWELL-NO-FMA: # BB#0: +; HASWELL-NO-FMA-NEXT: vmovaps {{.*}}(%rip), %xmm1 # [4:???] +; HASWELL-NO-FMA-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 # [12:???] +; HASWELL-NO-FMA-NEXT: retq +; +; AVX512-LABEL: v4f32_no_estimate2: +; AVX512: # BB#0: +; AVX512-NEXT: vmovaps {{.*}}(%rip), %xmm1 # [4:???] +; AVX512-NEXT: # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 # [12:???] +; AVX512-NEXT: retq + %div = fdiv fast <4 x float> , %x + ret <4 x float> %div +} + define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 { ; SSE-LABEL: v4f32_one_step2: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SSE-NEXT: subps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm2, %xmm1 -; SSE-NEXT: addps %xmm2, %xmm1 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: rcpps %xmm0, %xmm2 # [1:???] +; SSE-NEXT: mulps %xmm2, %xmm0 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: subps %xmm0, %xmm1 # [1:???] +; SSE-NEXT: mulps %xmm2, %xmm1 # [1:???] +; SSE-NEXT: addps %xmm2, %xmm1 # [1:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: movaps %xmm1, %xmm0 # [1:???] 
; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v4f32_one_step2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 # [1:???] +; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %xmm2 # [4:???] +; AVX-RECIP-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [4:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_one_step2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0 -; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0 # [4:???] +; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [4:???] ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v4f32_one_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm2 # [5:???] 
+; BTVER2-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # [2:???] +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [2:???] +; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [3:???] +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [2:???] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:???] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [7:???] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_one_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # [5:???] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [5:???] +; SANDY-NEXT: vmovaps {{.*}}(%rip), %xmm2 # [4:???] +; SANDY-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [3:???] +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:???] +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:???] +; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_one_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # [5:???] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:???] +; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # [1:???] +; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:???] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:???] 
; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # [5:???] +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:???] +; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v4f32_one_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vrcpps %xmm0, %xmm1 # [5:???] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:???] +; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # [1:???] +; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:???] +; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; KNL-NEXT: retq ; ; SKX-LABEL: v4f32_one_step2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %xmm0, %xmm1 -; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # [1:???] +; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # [4:???] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:???] +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:???] 
; SKX-NEXT: retq %div = fdiv fast <4 x float> , %x ret <4 x float> %div @@ -455,101 +679,106 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 { ; SSE-LABEL: v4f32_one_step_2_divs: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm1, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SSE-NEXT: subps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: addps %xmm1, %xmm2 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] -; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: mulps %xmm2, %xmm0 +; SSE-NEXT: rcpps %xmm0, %xmm1 # [1:???] +; SSE-NEXT: mulps %xmm1, %xmm0 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm2 # [4:???] +; SSE-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: subps %xmm0, %xmm2 # [1:???] +; SSE-NEXT: mulps %xmm1, %xmm2 # [1:???] +; SSE-NEXT: addps %xmm1, %xmm2 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm0 # [4:???] +; SSE-NEXT: # xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SSE-NEXT: mulps %xmm2, %xmm0 # [1:???] +; SSE-NEXT: mulps %xmm2, %xmm0 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v4f32_one_step_2_divs: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 # [1:???] +; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %xmm2 # [4:???] +; AVX-RECIP-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [1:???] 
+; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [4:???] +; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [1:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_one_step_2_divs: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0 -; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0 # [4:???] +; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [4:???] +; FMA-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [1:???] ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v4f32_one_step_2_divs: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm2 # [5:???] +; BTVER2-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # [2:???] +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [2:???] +; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [3:???] +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [2:???] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:???] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [7:???] +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [2:???] 
; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # [5:???] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [5:???] +; SANDY-NEXT: vmovaps {{.*}}(%rip), %xmm2 # [4:???] +; SANDY-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [3:???] +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:???] +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:???] +; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [9:???] +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:???] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_one_step_2_divs: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # [5:???] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:???] +; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # [1:???] +; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:???] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [9:???] +; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:???] 
; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # [5:???] +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:???] +; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [9:???] +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:???] ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v4f32_one_step_2_divs: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vrcpps %xmm0, %xmm1 # [5:???] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:???] +; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # [1:???] +; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:???] +; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [9:???] +; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:???] ; KNL-NEXT: retq ; ; SKX-LABEL: v4f32_one_step_2_divs: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %xmm0, %xmm1 -; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # [1:???] 
+; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # [4:???] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:???] +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # [9:???] +; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:???] ; SKX-NEXT: retq %div = fdiv fast <4 x float> , %x %div2 = fdiv fast <4 x float> %div, %x @@ -559,230 +788,304 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; SSE-LABEL: v4f32_two_step2: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: subps %xmm3, %xmm4 -; SSE-NEXT: mulps %xmm2, %xmm4 -; SSE-NEXT: addps %xmm2, %xmm4 -; SSE-NEXT: mulps %xmm4, %xmm0 -; SSE-NEXT: subps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm4, %xmm1 -; SSE-NEXT: addps %xmm4, %xmm1 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: rcpps %xmm0, %xmm2 # [1:???] +; SSE-NEXT: movaps %xmm0, %xmm3 # [1:???] +; SSE-NEXT: mulps %xmm2, %xmm3 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: movaps %xmm1, %xmm4 # [1:???] +; SSE-NEXT: subps %xmm3, %xmm4 # [1:???] +; SSE-NEXT: mulps %xmm2, %xmm4 # [1:???] +; SSE-NEXT: addps %xmm2, %xmm4 # [1:???] +; SSE-NEXT: mulps %xmm4, %xmm0 # [1:???] +; SSE-NEXT: subps %xmm0, %xmm1 # [1:???] +; SSE-NEXT: mulps %xmm4, %xmm1 # [1:???] +; SSE-NEXT: addps %xmm4, %xmm1 # [1:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: movaps %xmm1, %xmm0 # [1:???] 
; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v4f32_two_step2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 # [1:???] +; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2 # [1:???] +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %xmm3 # [4:???] +; AVX-RECIP-NEXT: # xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 # [1:???] +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 # [1:???] +; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 # [1:???] +; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [1:???] +; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [4:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_two_step2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 -; FMA-RECIP-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 # [1:???] +; FMA-RECIP-NEXT: vmovaps {{.*}}(%rip), %xmm2 # [4:???] 
+; FMA-RECIP-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # [1:???] +; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # [1:???] +; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [4:???] ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v4f32_two_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm3 # [5:???] +; BTVER2-NEXT: # xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # [2:???] +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 # [2:???] +; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 # [3:???] +; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 # [2:???] +; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 # [3:???] +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [2:???] +; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 # [3:???] +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [2:???] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:???] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [7:???] 
; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_two_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # [5:???] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # [5:???] +; SANDY-NEXT: vmovaps {{.*}}(%rip), %xmm3 # [4:???] +; SANDY-NEXT: # xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # [3:???] +; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # [5:???] +; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # [3:???] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [5:???] +; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # [3:???] +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:???] +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:???] +; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_two_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 -; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # [5:???] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:???] +; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # [1:???] +; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # [1:???] +; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # [1:???] 
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # [1:???] +; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # [1:???] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_two_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 -; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # [5:???] +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 # [5:???] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 # [4:???] +; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 # [5:???] +; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v4f32_two_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; KNL-NEXT: vmovaps %xmm1, %xmm3 -; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 -; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vrcpps %xmm0, %xmm1 # [5:???] 
+; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:???] +; KNL-NEXT: vmovaps %xmm1, %xmm3 # [1:???] +; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # [1:???] +; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # [1:???] +; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # [1:???] +; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # [1:???] +; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; KNL-NEXT: retq ; ; SKX-LABEL: v4f32_two_step2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %xmm0, %xmm1 -; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; SKX-NEXT: vmovaps %xmm1, %xmm3 -; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 -; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # [1:???] +; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # [4:???] +; SKX-NEXT: vmovaps %xmm1, %xmm3 # [1:???] +; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # [1:???] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # [1:???] +; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # [1:???] +; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # [1:???] +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:???] ; SKX-NEXT: retq %div = fdiv fast <4 x float> , %x ret <4 x float> %div } +define <8 x float> @v8f32_no_estimate2(<8 x float> %x) #0 { +; SSE-LABEL: v8f32_no_estimate2: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*}}(%rip), %xmm2 # [4:???] +; SSE-NEXT: # xmm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SSE-NEXT: divps %xmm0, %xmm2 # [10:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm3 # [4:???] +; SSE-NEXT: # xmm3 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; SSE-NEXT: divps %xmm1, %xmm3 # [10:???] +; SSE-NEXT: movaps %xmm2, %xmm0 # [1:???] +; SSE-NEXT: movaps %xmm3, %xmm1 # [1:???] +; SSE-NEXT: retq +; +; AVX-RECIP-LABEL: v8f32_no_estimate2: +; AVX-RECIP: # BB#0: +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %ymm1 # [4:???] 
+; AVX-RECIP-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 # [10:???] +; AVX-RECIP-NEXT: retq +; +; FMA-RECIP-LABEL: v8f32_no_estimate2: +; FMA-RECIP: # BB#0: +; FMA-RECIP-NEXT: vmovaps {{.*}}(%rip), %ymm1 # [4:???] +; FMA-RECIP-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 # [10:???] +; FMA-RECIP-NEXT: retq +; +; BTVER2-LABEL: v8f32_no_estimate2: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm1 # [5:???] +; BTVER2-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 # [19:???] +; BTVER2-NEXT: retq +; +; SANDY-LABEL: v8f32_no_estimate2: +; SANDY: # BB#0: +; SANDY-NEXT: vmovaps {{.*}}(%rip), %ymm1 # [4:???] +; SANDY-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 # [12:???] +; SANDY-NEXT: retq +; +; HASWELL-LABEL: v8f32_no_estimate2: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovaps {{.*}}(%rip), %ymm1 # [4:???] +; HASWELL-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # [19:???] +; HASWELL-NEXT: retq +; +; HASWELL-NO-FMA-LABEL: v8f32_no_estimate2: +; HASWELL-NO-FMA: # BB#0: +; HASWELL-NO-FMA-NEXT: vmovaps {{.*}}(%rip), %ymm1 # [4:???] +; HASWELL-NO-FMA-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 # [19:???] +; HASWELL-NO-FMA-NEXT: retq +; +; AVX512-LABEL: v8f32_no_estimate2: +; AVX512: # BB#0: +; AVX512-NEXT: vmovaps {{.*}}(%rip), %ymm1 # [4:???] 
+; AVX512-NEXT: # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 # [19:???] +; AVX512-NEXT: retq + %div = fdiv fast <8 x float> , %x + ret <8 x float> %div +} + define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { ; SSE-LABEL: v8f32_one_step2: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm1, %xmm4 -; SSE-NEXT: mulps %xmm4, %xmm1 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: subps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm4, %xmm3 -; SSE-NEXT: addps %xmm4, %xmm3 -; SSE-NEXT: rcpps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm1, %xmm0 -; SSE-NEXT: subps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: addps %xmm1, %xmm2 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: rcpps %xmm1, %xmm4 # [1:???] +; SSE-NEXT: mulps %xmm4, %xmm1 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm2 # [4:???] +; SSE-NEXT: # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: movaps %xmm2, %xmm3 # [1:???] +; SSE-NEXT: subps %xmm1, %xmm3 # [1:???] +; SSE-NEXT: mulps %xmm4, %xmm3 # [1:???] +; SSE-NEXT: addps %xmm4, %xmm3 # [1:???] +; SSE-NEXT: rcpps %xmm0, %xmm1 # [1:???] +; SSE-NEXT: mulps %xmm1, %xmm0 # [1:???] +; SSE-NEXT: subps %xmm0, %xmm2 # [1:???] +; SSE-NEXT: mulps %xmm1, %xmm2 # [1:???] +; SSE-NEXT: addps %xmm1, %xmm2 # [1:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 # [4:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 # [4:???] +; SSE-NEXT: movaps %xmm2, %xmm0 # [1:???] +; SSE-NEXT: movaps %xmm3, %xmm1 # [1:???] 
; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_one_step2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 # [1:???] +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %ymm2 # [4:???] +; AVX-RECIP-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_one_step2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0 -; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0 # [4:???] +; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:???] +; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:???] 
; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_one_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm2 # [5:???] +; BTVER2-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # [2:???] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [2:???] +; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [3:???] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [2:???] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:???] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [7:???] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_one_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SANDY-NEXT: vrcpps %ymm0, %ymm1 # [5:???] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [5:???] +; SANDY-NEXT: vmovaps {{.*}}(%rip), %ymm2 # [4:???] +; SANDY-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [3:???] +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:???] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:???] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:???] 
; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_one_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # [7:???] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:???] +; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # [1:???] +; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:???] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:???] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # [7:???] +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:???] +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:???] ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v8f32_one_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vrcpps %ymm0, %ymm1 # [7:???] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:???] +; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # [1:???] +; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:???] 
+; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:???] ; KNL-NEXT: retq ; ; SKX-LABEL: v8f32_one_step2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm1 -; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # [1:???] +; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # [4:???] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:???] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:???] ; SKX-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div @@ -791,110 +1094,116 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 { ; SSE-LABEL: v8f32_one_step_2_divs: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: subps %xmm0, %xmm4 -; SSE-NEXT: mulps %xmm2, %xmm4 -; SSE-NEXT: addps %xmm2, %xmm4 -; SSE-NEXT: rcpps %xmm1, %xmm0 -; SSE-NEXT: mulps %xmm0, %xmm1 -; SSE-NEXT: subps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm0, %xmm3 -; SSE-NEXT: addps %xmm0, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] -; SSE-NEXT: mulps %xmm3, %xmm1 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] -; SSE-NEXT: mulps %xmm4, %xmm0 -; SSE-NEXT: mulps %xmm4, %xmm0 -; SSE-NEXT: mulps %xmm3, %xmm1 +; SSE-NEXT: rcpps %xmm0, %xmm2 # [1:???] +; SSE-NEXT: mulps %xmm2, %xmm0 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm3 # [4:???] +; SSE-NEXT: # xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: movaps %xmm3, %xmm4 # [1:???] +; SSE-NEXT: subps %xmm0, %xmm4 # [1:???] +; SSE-NEXT: mulps %xmm2, %xmm4 # [1:???] +; SSE-NEXT: addps %xmm2, %xmm4 # [1:???] +; SSE-NEXT: rcpps %xmm1, %xmm0 # [1:???] +; SSE-NEXT: mulps %xmm0, %xmm1 # [1:???] +; SSE-NEXT: subps %xmm1, %xmm3 # [1:???] 
+; SSE-NEXT: mulps %xmm0, %xmm3 # [1:???] +; SSE-NEXT: addps %xmm0, %xmm3 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm1 # [4:???] +; SSE-NEXT: # xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] +; SSE-NEXT: mulps %xmm3, %xmm1 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm0 # [4:???] +; SSE-NEXT: # xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; SSE-NEXT: mulps %xmm4, %xmm0 # [1:???] +; SSE-NEXT: mulps %xmm4, %xmm0 # [1:???] +; SSE-NEXT: mulps %xmm3, %xmm1 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_one_step_2_divs: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 # [1:???] +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %ymm2 # [4:???] +; AVX-RECIP-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [4:???] +; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [1:???] 
; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_one_step_2_divs: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0 -; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; FMA-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0 # [4:???] +; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:???] +; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [4:???] +; FMA-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [1:???] ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_one_step_2_divs: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm2 # [5:???] +; BTVER2-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # [2:???] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [2:???] +; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [3:???] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [2:???] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:???] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [7:???] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [2:???] 
; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; SANDY-NEXT: vrcpps %ymm0, %ymm1 # [5:???] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [5:???] +; SANDY-NEXT: vmovaps {{.*}}(%rip), %ymm2 # [4:???] +; SANDY-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [3:???] +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:???] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:???] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [9:???] +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:???] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_one_step_2_divs: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # [7:???] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:???] +; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # [1:???] +; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:???] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [9:???] +; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:???] 
; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # [7:???] +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:???] +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [9:???] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:???] ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v8f32_one_step_2_divs: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; KNL-NEXT: vrcpps %ymm0, %ymm1 # [7:???] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:???] +; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # [1:???] +; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:???] +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [9:???] +; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:???] ; KNL-NEXT: retq ; ; SKX-LABEL: v8f32_one_step_2_divs: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm1 -; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 +; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # [1:???] 
+; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # [4:???] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:???] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # [9:???] +; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:???] ; SKX-NEXT: retq %div = fdiv fast <8 x float> , %x %div2 = fdiv fast <8 x float> %div, %x @@ -904,142 +1213,147 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; SSE-LABEL: v8f32_two_step2: ; SSE: # BB#0: -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: rcpps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: mulps %xmm3, %xmm4 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: subps %xmm4, %xmm5 -; SSE-NEXT: mulps %xmm3, %xmm5 -; SSE-NEXT: addps %xmm3, %xmm5 -; SSE-NEXT: mulps %xmm5, %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm5, %xmm3 -; SSE-NEXT: addps %xmm5, %xmm3 -; SSE-NEXT: rcpps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: mulps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: subps %xmm4, %xmm5 -; SSE-NEXT: mulps %xmm1, %xmm5 -; SSE-NEXT: addps %xmm1, %xmm5 -; SSE-NEXT: mulps %xmm5, %xmm2 -; SSE-NEXT: subps %xmm2, %xmm0 -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: addps %xmm5, %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 # [1:???] +; SSE-NEXT: rcpps %xmm1, %xmm3 # [1:???] +; SSE-NEXT: movaps %xmm1, %xmm4 # [1:???] +; SSE-NEXT: mulps %xmm3, %xmm4 # [1:???] +; SSE-NEXT: movaps {{.*}}(%rip), %xmm0 # [4:???] +; SSE-NEXT: # xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SSE-NEXT: movaps %xmm0, %xmm5 # [1:???] +; SSE-NEXT: subps %xmm4, %xmm5 # [1:???] +; SSE-NEXT: mulps %xmm3, %xmm5 # [1:???] +; SSE-NEXT: addps %xmm3, %xmm5 # [1:???] +; SSE-NEXT: mulps %xmm5, %xmm1 # [1:???] +; SSE-NEXT: movaps %xmm0, %xmm3 # [1:???] 
+; SSE-NEXT: subps %xmm1, %xmm3 # [1:???] +; SSE-NEXT: mulps %xmm5, %xmm3 # [1:???] +; SSE-NEXT: addps %xmm5, %xmm3 # [1:???] +; SSE-NEXT: rcpps %xmm2, %xmm1 # [1:???] +; SSE-NEXT: movaps %xmm2, %xmm4 # [1:???] +; SSE-NEXT: mulps %xmm1, %xmm4 # [1:???] +; SSE-NEXT: movaps %xmm0, %xmm5 # [1:???] +; SSE-NEXT: subps %xmm4, %xmm5 # [1:???] +; SSE-NEXT: mulps %xmm1, %xmm5 # [1:???] +; SSE-NEXT: addps %xmm1, %xmm5 # [1:???] +; SSE-NEXT: mulps %xmm5, %xmm2 # [1:???] +; SSE-NEXT: subps %xmm2, %xmm0 # [1:???] +; SSE-NEXT: mulps %xmm5, %xmm0 # [1:???] +; SSE-NEXT: addps %xmm5, %xmm0 # [1:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 # [4:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 # [4:???] +; SSE-NEXT: movaps %xmm3, %xmm1 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_two_step2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 # [1:???] +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 # [1:???] +; AVX-RECIP-NEXT: vmovaps {{.*}}(%rip), %ymm3 # [4:???] +; AVX-RECIP-NEXT: # ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 # [1:???] +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 # [1:???] +; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 # [1:???] +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 # [1:???] 
+; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_two_step2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3 -; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 -; FMA-RECIP-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 # [1:???] +; FMA-RECIP-NEXT: vmovaps {{.*}}(%rip), %ymm2 # [4:???] +; FMA-RECIP-NEXT: # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # [1:???] +; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # [1:???] +; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # [1:???] +; FMA-RECIP-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # [1:???] +; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:???] 
; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_two_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm3 # [5:???] +; BTVER2-NEXT: # ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # [2:???] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # [2:???] +; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # [3:???] +; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # [2:???] +; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # [3:???] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [2:???] +; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # [3:???] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [2:???] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:???] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [7:???] 
; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_two_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SANDY-NEXT: vrcpps %ymm0, %ymm1 # [5:???] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # [5:???] +; SANDY-NEXT: vmovaps {{.*}}(%rip), %ymm3 # [4:???] +; SANDY-NEXT: # ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # [3:???] +; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # [5:???] +; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # [3:???] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [5:???] +; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # [3:???] +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:???] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:???] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:???] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_two_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NEXT: vmovaps %ymm1, %ymm3 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 -; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # [7:???] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:???] +; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # [1:???] 
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # [1:???] +; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # [1:???] +; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # [1:???] +; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # [1:???] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:???] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_two_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 -; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # [7:???] +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # [5:???] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 # [5:???] +; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # [5:???] +; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # [5:???] +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # [3:???] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:???] 
; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v8f32_two_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; KNL-NEXT: vmovaps %ymm1, %ymm3 -; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 -; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vrcpps %ymm0, %ymm1 # [7:???] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:???] +; KNL-NEXT: vmovaps %ymm1, %ymm3 # [1:???] +; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # [1:???] +; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # [1:???] +; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # [1:???] +; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # [1:???] +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:???] ; KNL-NEXT: retq ; ; SKX-LABEL: v8f32_two_step2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm1 -; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; SKX-NEXT: vmovaps %ymm1, %ymm3 -; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 -; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # [1:???] +; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # [5:???] +; SKX-NEXT: vmovaps %ymm1, %ymm3 # [1:???] +; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # [1:???] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # [1:???] +; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # [1:???] +; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # [1:???] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:???] ; SKX-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div @@ -1048,48 +1362,48 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 { ; SSE-LABEL: v8f32_no_step: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm0, %xmm0 -; SSE-NEXT: rcpps %xmm1, %xmm1 +; SSE-NEXT: rcpps %xmm0, %xmm0 # [1:???] 
+; SSE-NEXT: rcpps %xmm1, %xmm1 # [1:???] ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_no_step: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 # [1:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_no_step: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 +; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 # [1:???] ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_no_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vrcpps %ymm0, %ymm0 +; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # [2:???] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_no_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm0 +; SANDY-NEXT: vrcpps %ymm0, %ymm0 # [5:???] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_no_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm0 +; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # [7:???] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_no_step: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # [7:???] ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v8f32_no_step: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm0 +; KNL-NEXT: vrcpps %ymm0, %ymm0 # [7:???] ; KNL-NEXT: retq ; ; SKX-LABEL: v8f32_no_step: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm0 +; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # [1:???] ; SKX-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div @@ -1098,58 +1412,58 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 { ; SSE-LABEL: v8f32_no_step2: ; SSE: # BB#0: -; SSE-NEXT: rcpps %xmm1, %xmm1 -; SSE-NEXT: rcpps %xmm0, %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 +; SSE-NEXT: rcpps %xmm1, %xmm1 # [1:???] +; SSE-NEXT: rcpps %xmm0, %xmm0 # [1:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 # [4:???] +; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 # [4:???] 
; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v8f32_no_step2: ; AVX-RECIP: # BB#0: -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 # [1:???] +; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:???] ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_no_step2: ; FMA-RECIP: # BB#0: -; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 # [1:???] +; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:???] ; FMA-RECIP-NEXT: retq ; ; BTVER2-LABEL: v8f32_no_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vrcpps %ymm0, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # [2:???] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [7:???] ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_no_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SANDY-NEXT: vrcpps %ymm0, %ymm0 # [5:???] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:???] ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_no_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # [7:???] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:???] ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_no_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # [7:???] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:???] ; HASWELL-NO-FMA-NEXT: retq ; ; KNL-LABEL: v8f32_no_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vrcpps %ymm0, %ymm0 # [7:???] +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:???] 
; KNL-NEXT: retq ; ; SKX-LABEL: v8f32_no_step2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # [1:???] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:???] ; SKX-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div Index: test/CodeGen/X86/recip-pic.ll =================================================================== --- test/CodeGen/X86/recip-pic.ll +++ test/CodeGen/X86/recip-pic.ll @@ -1,27 +1,110 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -enable-unsafe-fp-math -mcpu=slm -relocation-model=pic | FileCheck %s --check-prefix=CHECK +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=slm -relocation-model=pic -print-latency | FileCheck %s --check-prefix=CHECK -define fastcc float @foo(float %x) unnamed_addr #0 { -; CHECK-LABEL: foo: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: calll .L0$pb +define float @f32_no_estimate_2(float %x) #0 { +; CHECK-LABEL: f32_no_estimate_2: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax # [1:???] ; CHECK-NEXT: .Lcfi0: +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: calll .L0$pb # [1:???] +; CHECK-NEXT: .Lcfi1: ; CHECK-NEXT: .cfi_adjust_cfa_offset 4 ; CHECK-NEXT: .L0$pb: ; CHECK-NEXT: popl %eax -; CHECK-NEXT: .Lcfi1: +; CHECK-NEXT: .Lcfi2: ; CHECK-NEXT: .cfi_adjust_cfa_offset -4 -; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: .Ltmp0: # [1:???] ; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: divss %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 -; CHECK-NEXT: movss %xmm1, (%eax) +; CHECK-NEXT: movss {{\.LCPI.*}}@GOTOFF(%eax), %xmm0 # [3:???] +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: divss {{[0-9]+}}(%esp), %xmm0 # [37:???] +; CHECK-NEXT: movss %xmm0, (%eax) # [1:???] +; CHECK-NEXT: movss %xmm0, (%esp) # [1:???] +; CHECK-NEXT: flds (%esp) # [3:???] 
+; CHECK-NEXT: popl %eax # [3:???] ; CHECK-NEXT: retl -entry: %div = fdiv fast float 3.0, %x store float %div, float* undef, align 4 ret float %div } +define float @f32_one_step(float %x) #1 { +; CHECK-LABEL: f32_one_step: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax # [1:???] +; CHECK-NEXT: .Lcfi3: +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # [3:???] +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: calll .L1$pb # [1:???] +; CHECK-NEXT: .Lcfi4: +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: .L1$pb: +; CHECK-NEXT: popl %eax +; CHECK-NEXT: .Lcfi5: +; CHECK-NEXT: .cfi_adjust_cfa_offset -4 +; CHECK-NEXT: .Ltmp1: # [1:???] +; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.L1$pb), %eax +; CHECK-NEXT: movss {{\.LCPI.*}}@GOTOFF(%eax), %xmm2 # [3:???] +; CHECK-NEXT: # xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: rcpss %xmm0, %xmm1 # [5:???] +; CHECK-NEXT: mulss %xmm1, %xmm0 # [5:???] +; CHECK-NEXT: subss %xmm0, %xmm2 # [3:???] +; CHECK-NEXT: mulss %xmm1, %xmm2 # [5:???] +; CHECK-NEXT: addss %xmm1, %xmm2 # [3:???] +; CHECK-NEXT: mulss {{\.LCPI.*}}@GOTOFF(%eax), %xmm2 # [8:???] +; CHECK-NEXT: movss %xmm2, (%eax) # [1:???] +; CHECK-NEXT: movss %xmm2, (%esp) # [1:???] +; CHECK-NEXT: flds (%esp) # [3:???] +; CHECK-NEXT: popl %eax # [3:???] +; CHECK-NEXT: retl + %div = fdiv fast float 3.0, %x + store float %div, float* undef, align 4 + ret float %div +} + +define float @f32_two_steps(float %x) #2 { +; CHECK-LABEL: f32_two_steps: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax # [1:???] +; CHECK-NEXT: .Lcfi6: +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # [3:???] +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: calll .L2$pb # [1:???] +; CHECK-NEXT: .Lcfi7: +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: .L2$pb: +; CHECK-NEXT: popl %eax +; CHECK-NEXT: .Lcfi8: +; CHECK-NEXT: .cfi_adjust_cfa_offset -4 +; CHECK-NEXT: .Ltmp2: # [1:???] 
+; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.L2$pb), %eax +; CHECK-NEXT: movss {{\.LCPI.*}}@GOTOFF(%eax), %xmm3 # [3:???] +; CHECK-NEXT: # xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: rcpss %xmm0, %xmm1 # [5:???] +; CHECK-NEXT: movaps %xmm0, %xmm2 # [1:???] +; CHECK-NEXT: movaps %xmm3, %xmm4 # [1:???] +; CHECK-NEXT: mulss %xmm1, %xmm2 # [5:???] +; CHECK-NEXT: subss %xmm2, %xmm4 # [3:???] +; CHECK-NEXT: mulss %xmm1, %xmm4 # [5:???] +; CHECK-NEXT: addss %xmm1, %xmm4 # [3:???] +; CHECK-NEXT: mulss %xmm4, %xmm0 # [5:???] +; CHECK-NEXT: subss %xmm0, %xmm3 # [3:???] +; CHECK-NEXT: mulss %xmm4, %xmm3 # [5:???] +; CHECK-NEXT: addss %xmm4, %xmm3 # [3:???] +; CHECK-NEXT: mulss {{\.LCPI.*}}@GOTOFF(%eax), %xmm3 # [8:???] +; CHECK-NEXT: movss %xmm3, (%eax) # [1:???] +; CHECK-NEXT: movss %xmm3, (%esp) # [1:???] +; CHECK-NEXT: flds (%esp) # [3:???] +; CHECK-NEXT: popl %eax # [3:???] +; CHECK-NEXT: retl + %div = fdiv fast float 3.0, %x + store float %div, float* undef, align 4 + ret float %div +} +attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" } +attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" } +attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" } Index: tools/llc/llc.cpp =================================================================== --- tools/llc/llc.cpp +++ tools/llc/llc.cpp @@ -99,6 +99,10 @@ static cl::opt ShowMCEncoding("show-mc-encoding", cl::Hidden, cl::desc("Show encoding in .s output")); +static cl::opt PrintLatency("print-latency", cl::Hidden, + cl::init(false), + cl::desc("Print latency of instructions in .s output")); + static cl::opt EnableDwarfDirectory( "enable-dwarf-directory", cl::Hidden, cl::desc("Use .file directives with an explicit directory.")); @@ -446,6 +450,7 @@ TargetOptions Options = InitTargetOptionsFromCodeGenFlags(); Options.DisableIntegratedAS = NoIntegratedAssembler; Options.MCOptions.ShowMCEncoding = ShowMCEncoding; + 
Options.MCOptions.PrintLatency = PrintLatency; Options.MCOptions.MCUseDwarfDirectory = EnableDwarfDirectory; Options.MCOptions.AsmVerbose = AsmVerbose; Options.MCOptions.PreserveAsmComments = PreserveComments;