Index: lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- lib/CodeGen/CodeGenPrepare.cpp +++ lib/CodeGen/CodeGenPrepare.cpp @@ -4470,11 +4470,29 @@ // case currently. CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition()); + if (!Cmp) + return false; + + Value *CmpOp0 = Cmp->getOperand(0); + Value *CmpOp1 = Cmp->getOperand(1); + + // Emit "cmov on compare with an expensive operand" as a branch to avoid stalls + // on executing expensive instructions like division. + auto IsExpensiveCostInst = [&](Value *V) -> bool { + auto *I = dyn_cast<Instruction>(V); + if (I && I->getOpcode() == Instruction::FDiv) + return true; + + return false; + }; + + if (IsExpensiveCostInst(CmpOp0) || IsExpensiveCostInst(CmpOp1)) + return true; // If a branch is predictable, an out-of-order CPU can avoid blocking on its // comparison condition. If the compare has more than one use, there's // probably another cmov or setcc around, so it's not worth emitting a branch. - if (!Cmp || !Cmp->hasOneUse()) + if (!Cmp->hasOneUse()) return false; // If either operand of the select is expensive and only needed on one side Index: test/CodeGen/X86/machine-combiner.ll =================================================================== --- test/CodeGen/X86/machine-combiner.ll +++ test/CodeGen/X86/machine-combiner.ll @@ -363,18 +363,18 @@ define float @reassociate_mins_single(float %x0, float %x1, float %x2, float %x3) { ; SSE-LABEL: reassociate_mins_single: ; SSE: # BB#0: -; SSE-NEXT: divss %xmm1, %xmm0 +; SSE-NEXT: mulss %xmm1, %xmm0 ; SSE-NEXT: minss %xmm3, %xmm2 ; SSE-NEXT: minss %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_mins_single: ; AVX: # BB#0: -; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vminss %xmm3, %xmm2, %xmm1 ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq - %t0 = fdiv float %x0, %x1 + %t0 = fmul float %x0, %x1 %cmp1 = fcmp olt float %x2, %t0 %sel1 = select i1 %cmp1, float %x2, float %t0 %cmp2 =
fcmp olt float %x3, %sel1 @@ -387,18 +387,18 @@ define float @reassociate_maxs_single(float %x0, float %x1, float %x2, float %x3) { ; SSE-LABEL: reassociate_maxs_single: ; SSE: # BB#0: -; SSE-NEXT: divss %xmm1, %xmm0 +; SSE-NEXT: mulss %xmm1, %xmm0 ; SSE-NEXT: maxss %xmm3, %xmm2 ; SSE-NEXT: maxss %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_maxs_single: ; AVX: # BB#0: -; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmaxss %xmm3, %xmm2, %xmm1 ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq - %t0 = fdiv float %x0, %x1 + %t0 = fmul float %x0, %x1 %cmp1 = fcmp ogt float %x2, %t0 %sel1 = select i1 %cmp1, float %x2, float %t0 %cmp2 = fcmp ogt float %x3, %sel1 @@ -411,18 +411,18 @@ define double @reassociate_mins_double(double %x0, double %x1, double %x2, double %x3) { ; SSE-LABEL: reassociate_mins_double: ; SSE: # BB#0: -; SSE-NEXT: divsd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: minsd %xmm3, %xmm2 ; SSE-NEXT: minsd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_mins_double: ; AVX: # BB#0: -; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vminsd %xmm3, %xmm2, %xmm1 ; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq - %t0 = fdiv double %x0, %x1 + %t0 = fmul double %x0, %x1 %cmp1 = fcmp olt double %x2, %t0 %sel1 = select i1 %cmp1, double %x2, double %t0 %cmp2 = fcmp olt double %x3, %sel1 @@ -435,18 +435,18 @@ define double @reassociate_maxs_double(double %x0, double %x1, double %x2, double %x3) { ; SSE-LABEL: reassociate_maxs_double: ; SSE: # BB#0: -; SSE-NEXT: divsd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: maxsd %xmm3, %xmm2 ; SSE-NEXT: maxsd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_maxs_double: ; AVX: # BB#0: -; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmaxsd %xmm3, %xmm2, %xmm1 ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq - %t0 = fdiv double %x0, 
%x1 + %t0 = fmul double %x0, %x1 %cmp1 = fcmp ogt double %x2, %t0 %sel1 = select i1 %cmp1, double %x2, double %t0 %cmp2 = fcmp ogt double %x3, %sel1 Index: test/Transforms/CodeGenPrepare/X86/select.ll =================================================================== --- test/Transforms/CodeGenPrepare/X86/select.ll +++ test/Transforms/CodeGenPrepare/X86/select.ll @@ -134,3 +134,18 @@ ; CHECK: %sel = select i1 %cmp, i32 %div1, i32 %div2 } +; Nothing to sink here, but this gets converted to a branch to +; avoid stalling an out-of-order CPU on a predictable branch, +; because the cmp's operand is an expensive instruction like division. + +define float @fdiv_do_transform(float %a, float %b) { +entry: + %div = fdiv float %a, %b + %cmp = fcmp ogt float %div, %b + %sel = select i1 %cmp, float %div, float 8.0 + ret float %sel + +; CHECK-LABEL: @fdiv_do_transform( +; CHECK: br i1 %cmp, label %select.end, label %select.false +} +