Index: include/llvm/MC/MCSchedule.h
===================================================================
--- include/llvm/MC/MCSchedule.h
+++ include/llvm/MC/MCSchedule.h
@@ -180,6 +180,12 @@
   unsigned MispredictPenalty;
   static const unsigned DefaultMispredictPenalty = 10;
 
+  // FdivLatency is the expected latency of a single-precision fdiv instruction.
+  //
+  // Most microarchitectures have a high latency for floating-point division.
+  unsigned FdivLatency;
+  static const unsigned DefaultFdivLatency = 14;
+
   bool PostRAScheduler; // default value is false
 
   bool CompleteModel;
Index: include/llvm/Target/TargetSchedule.td
===================================================================
--- include/llvm/Target/TargetSchedule.td
+++ include/llvm/Target/TargetSchedule.td
@@ -84,6 +84,7 @@
   int LoadLatency = -1; // Cycles for loads to access the cache.
   int HighLatency = -1; // Approximation of cycles for "high latency" ops.
   int MispredictPenalty = -1; // Extra cycles for a mispredicted branch.
+  int FdivLatency = -1; // Single-precision fdiv latency.
 
   // Per-cycle resources tables.
   ProcessorItineraries Itineraries = NoItineraries;
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -121,6 +121,7 @@
     const TargetMachine *TM;
     const TargetLowering *TLI;
     const TargetTransformInfo *TTI;
+    const TargetSubtargetInfo *STI;
     const TargetLibraryInfo *TLInfo;
 
     /// As we scan instructions optimizing them, this is the next instruction
@@ -214,8 +215,10 @@
   PromotedInsts.clear();
   ModifiedDT = false;
-  if (TM)
-    TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+  if (TM) {
+    STI = TM->getSubtargetImpl(F);
+    TLI = STI->getTargetLowering();
+  }
   TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   OptSize = F.optForSize();
 
@@ -4500,6 +4503,7 @@
 
 /// Returns true if a SelectInst should be turned into an explicit branch.
 static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
+                                                const TargetSubtargetInfo *STI,
                                                 SelectInst *SI) {
   // FIXME: This should use the same heuristics as IfConversion to determine
   // whether a select is better represented as a branch. This requires that
@@ -4507,11 +4511,31 @@
   // case currently.
 
   CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
+  if (!Cmp)
+    return false;
+
+  Value *CmpOp0 = Cmp->getOperand(0);
+  Value *CmpOp1 = Cmp->getOperand(1);
+
+  // Emit "cmov on compare with an expensive operand" as a branch to avoid
+  // stalls on executing expensive instructions like division.
+  auto IsExpensiveCostInst = [&](Value *V) -> bool {
+    auto *I = dyn_cast<Instruction>(V);
+    if (I && I->getOpcode() == Instruction::FDiv &&
+        STI->getSchedModel().FdivLatency >
+            STI->getSchedModel().MispredictPenalty)
+      return true;
+
+    return false;
+  };
+
+  if (IsExpensiveCostInst(CmpOp0) || IsExpensiveCostInst(CmpOp1))
+    return true;
 
   // If a branch is predictable, an out-of-order CPU can avoid blocking on its
   // comparison condition. If the compare has more than one use, there's
   // probably another cmov or setcc around, so it's not worth emitting a branch.
-  if (!Cmp || !Cmp->hasOneUse())
+  if (!Cmp->hasOneUse())
     return false;
 
   // If either operand of the select is expensive and only needed on one side
@@ -4530,7 +4554,7 @@
   bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
 
   // Can we convert the 'select' to CF ?
-  if (DisableSelectToBranch || OptSize || !TLI || VectorCond)
+  if (DisableSelectToBranch || OptSize || !TLI || !STI || VectorCond)
     return false;
 
   TargetLowering::SelectSupportKind SelectKind;
@@ -4546,7 +4570,7 @@
     // We have efficient codegen support for the select instruction.
     // Check if it is profitable to keep this 'select'.
     if (!TLI->isPredictableSelectExpensive() ||
-        !isFormingBranchFromSelectProfitable(TTI, SI))
+        !isFormingBranchFromSelectProfitable(TTI, STI, SI))
       return false;
   }
 
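To make the new heuristic concrete, here is a hand-written LLVM IR sketch of the transform it enables (illustrative only; the function names are hypothetical, and the "after" shape is inferred from CodeGenPrepare's select.false/select.end convention, which the new test below also relies on). When the target reports FdivLatency > MispredictPenalty, a select guarded by a compare on an fdiv result is split into a branch, so a correctly predicted branch can proceed instead of a cmov stalling on the divide:

; Before CodeGenPrepare: the compare consumes a long-latency fdiv result.
define float @sel_on_fdiv(float %a, float %b) {
entry:
  %div = fdiv float %a, %b
  %cmp = fcmp ogt float %div, %b
  %sel = select i1 %cmp, float %div, float 8.000000e+00
  ret float %sel
}

; Expected shape after CodeGenPrepare (exact output may differ).
define float @sel_on_fdiv_branched(float %a, float %b) {
entry:
  %div = fdiv float %a, %b
  %cmp = fcmp ogt float %div, %b
  br i1 %cmp, label %select.end, label %select.false

select.false:                                     ; preds = %entry
  br label %select.end

select.end:                                       ; preds = %select.false, %entry
  %sel = phi float [ %div, %entry ], [ 8.000000e+00, %select.false ]
  ret float %sel
}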
Index: lib/MC/MCSchedule.cpp
===================================================================
--- lib/MC/MCSchedule.cpp
+++ lib/MC/MCSchedule.cpp
@@ -24,6 +24,7 @@
     DefaultLoadLatency,
     DefaultHighLatency,
     DefaultMispredictPenalty,
+    DefaultFdivLatency,
     false,
     true,
     0,
Index: lib/Target/AArch64/AArch64SchedA57.td
===================================================================
--- lib/Target/AArch64/AArch64SchedA57.td
+++ lib/Target/AArch64/AArch64SchedA57.td
@@ -26,6 +26,7 @@
   let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
   let LoadLatency = 4;         // Optimistic load latency
   let MispredictPenalty = 14;  // Fetch + Decode/Rename/Dispatch + Branch
+  let FdivLatency = 18;        // Single-precision fdiv latency
 
   // Enable partial & runtime unrolling. The magic number is chosen based on
   // experiments and benchmarking data.
Index: test/CodeGen/X86/machine-combiner.ll
===================================================================
--- test/CodeGen/X86/machine-combiner.ll
+++ test/CodeGen/X86/machine-combiner.ll
@@ -363,18 +363,18 @@
 define float @reassociate_mins_single(float %x0, float %x1, float %x2, float %x3) {
 ; SSE-LABEL: reassociate_mins_single:
 ; SSE:       # BB#0:
-; SSE-NEXT:    divss %xmm1, %xmm0
+; SSE-NEXT:    mulss %xmm1, %xmm0
 ; SSE-NEXT:    minss %xmm3, %xmm2
 ; SSE-NEXT:    minss %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: reassociate_mins_single:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vminss %xmm3, %xmm2, %xmm1
 ; AVX-NEXT:    vminss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
-  %t0 = fdiv float %x0, %x1
+  %t0 = fmul float %x0, %x1
   %cmp1 = fcmp olt float %x2, %t0
   %sel1 = select i1 %cmp1, float %x2, float %t0
   %cmp2 = fcmp olt float %x3, %sel1
@@ -387,18 +387,18 @@
 define float @reassociate_maxs_single(float %x0, float %x1, float %x2, float %x3) {
 ; SSE-LABEL: reassociate_maxs_single:
 ; SSE:       # BB#0:
-; SSE-NEXT:    divss %xmm1, %xmm0
+; SSE-NEXT:    mulss %xmm1, %xmm0
 ; SSE-NEXT:    maxss %xmm3, %xmm2
 ; SSE-NEXT:    maxss %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: reassociate_maxs_single:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmaxss %xmm3, %xmm2, %xmm1
 ; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
-  %t0 = fdiv float %x0, %x1
+  %t0 = fmul float %x0, %x1
   %cmp1 = fcmp ogt float %x2, %t0
   %sel1 = select i1 %cmp1, float %x2, float %t0
   %cmp2 = fcmp ogt float %x3, %sel1
@@ -411,18 +411,18 @@
 define double @reassociate_mins_double(double %x0, double %x1, double %x2, double %x3) {
 ; SSE-LABEL: reassociate_mins_double:
 ; SSE:       # BB#0:
-; SSE-NEXT:    divsd %xmm1, %xmm0
+; SSE-NEXT:    mulsd %xmm1, %xmm0
 ; SSE-NEXT:    minsd %xmm3, %xmm2
 ; SSE-NEXT:    minsd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: reassociate_mins_double:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vminsd %xmm3, %xmm2, %xmm1
 ; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
-  %t0 = fdiv double %x0, %x1
+  %t0 = fmul double %x0, %x1
   %cmp1 = fcmp olt double %x2, %t0
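The machine-combiner.ll updates above are fallout rather than new coverage: those tests exercise reassociation of min/max patterns, and their selects compared against an fdiv result, which is exactly the shape the new heuristic now splits into a branch before instruction selection can match minss/maxss. Switching the tests to fmul keeps the selects (and thus the reassociation being tested) intact. A reduced LLVM IR sketch of the pattern in question (hypothetical, not taken verbatim from the test file):

; With fdiv feeding the compare, CodeGenPrepare would now form a branch and
; no minss is matched; with fmul the select survives to instruction
; selection and minss is emitted.
define float @min_pattern(float %x0, float %x1, float %x2) {
  %t0 = fmul float %x0, %x1
  %cmp = fcmp olt float %x2, %t0
  %sel = select i1 %cmp, float %x2, float %t0
  ret float %sel
}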
   %sel1 = select i1 %cmp1, double %x2, double %t0
   %cmp2 = fcmp olt double %x3, %sel1
@@ -435,18 +435,18 @@
 define double @reassociate_maxs_double(double %x0, double %x1, double %x2, double %x3) {
 ; SSE-LABEL: reassociate_maxs_double:
 ; SSE:       # BB#0:
-; SSE-NEXT:    divsd %xmm1, %xmm0
+; SSE-NEXT:    mulsd %xmm1, %xmm0
 ; SSE-NEXT:    maxsd %xmm3, %xmm2
 ; SSE-NEXT:    maxsd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: reassociate_maxs_double:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmaxsd %xmm3, %xmm2, %xmm1
 ; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
-  %t0 = fdiv double %x0, %x1
+  %t0 = fmul double %x0, %x1
   %cmp1 = fcmp ogt double %x2, %t0
   %sel1 = select i1 %cmp1, double %x2, double %t0
   %cmp2 = fcmp ogt double %x3, %sel1
Index: test/Transforms/CodeGenPrepare/X86/select.ll
===================================================================
--- test/Transforms/CodeGenPrepare/X86/select.ll
+++ test/Transforms/CodeGenPrepare/X86/select.ll
@@ -134,3 +134,18 @@
 ; CHECK: %sel = select i1 %cmp, i32 %div1, i32 %div2
 }
 
+; Nothing to sink here, but this gets converted to a branch because the
+; cmp's operand is an expensive instruction like division, and branching
+; avoids stalling an out-of-order CPU on the slow fdiv.
+
+define float @fdiv_do_transform(float %a, float %b) {
+entry:
+  %div = fdiv float %a, %b
+  %cmp = fcmp ogt float %div, %b
+  %sel = select i1 %cmp, float %div, float 8.0
+  ret float %sel
+
+; CHECK-LABEL: @fdiv_do_transform(
+; CHECK: br i1 %cmp, label %select.end, label %select.false
+}
+
Index: utils/TableGen/SubtargetEmitter.cpp
===================================================================
--- utils/TableGen/SubtargetEmitter.cpp
+++ utils/TableGen/SubtargetEmitter.cpp
@@ -1157,6 +1157,7 @@
   EmitProcessorProp(OS, PM.ModelDef, "LoadLatency", ',');
   EmitProcessorProp(OS, PM.ModelDef, "HighLatency", ',');
   EmitProcessorProp(OS, PM.ModelDef, "MispredictPenalty", ',');
+  EmitProcessorProp(OS, PM.ModelDef, "FdivLatency", ',');
   OS << "  " << (bool)(PM.ModelDef ?
                          PM.ModelDef->getValueAsBit("PostRAScheduler") : 0)
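A final note on coverage: the new select.ll test only checks the positive case. A hypothetical companion test (not part of this patch; the expected behavior is assumed from the existing heuristics, which keep a select whose compare operand is cheap) could pin down the negative side:

; Hypothetical negative test: no branch should be formed, because the
; compare operand is a cheap fmul rather than a long-latency fdiv.
define float @fmul_no_transform(float %a, float %b) {
entry:
  %mul = fmul float %a, %b
  %cmp = fcmp ogt float %mul, %b
  %sel = select i1 %cmp, float %mul, float 8.0
  ret float %sel

; CHECK-LABEL: @fmul_no_transform(
; CHECK: %sel = select i1 %cmp, float %mul
}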