Index: llvm/trunk/lib/Target/X86/X86CmovConversion.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86CmovConversion.cpp
+++ llvm/trunk/lib/Target/X86/X86CmovConversion.cpp
@@ -67,6 +67,11 @@
     cl::desc("Enable the X86 cmov-to-branch optimization."), cl::init(true),
     cl::Hidden);
 
+static cl::opt<unsigned>
+    GainCycleThreshold("x86-cmov-converter-threshold",
+                       cl::desc("Minimum gain per loop (in cycles) threshold."),
+                       cl::init(4), cl::Hidden);
+
 /// Converts X86 cmov instructions into branches when profitable.
 class X86CmovConverterPass : public MachineFunctionPass {
 public:
@@ -389,19 +394,28 @@
     //   Critical-path is iteration dependent - there is dependency of
     //   critical-path instructions on critical-path instructions of
     //   previous iteration.
-    //   Thus, it is required to check the gradient of the gain - the
-    //   change in Depth-Diff compared to the change in Loop-Depth between
-    //   1st and 2nd iterations.
+    //   Thus, check the gain percent of the 2nd iteration (similar to the
+    //   previous case), but it is also required to check the gradient of
+    //   the gain - the change in Depth-Diff compared to the change in
+    //   Loop-Depth between 1st and 2nd iterations.
     //   To be conservative, the gradient need to be at least 50%.
     //
+    //   In addition, in order not to optimize loops with very small gain, the
+    //   gain (in cycles) after 2nd iteration should not be less than a given
+    //   threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
+    //
     // If loop is not worth optimizing, remove all CMOV-group-candidates.
     //===--------------------------------------------------------------------===//
+    if (Diff[1] < GainCycleThreshold)
+      return false;
+
     bool WorthOptLoop = false;
     if (Diff[1] == Diff[0])
       WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
     else if (Diff[1] > Diff[0])
       WorthOptLoop =
-          (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth);
+          (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
+          (Diff[1] * 8 >= LoopDepth[1].Depth);
 
     if (!WorthOptLoop)
       return false;
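To make the updated heuristic easier to follow than the interleaved hunks allow, here is a minimal self-contained C++ sketch of the loop-profitability check as it reads after this patch. The helper name isWorthOptimizingLoop, its scalar parameters, and the sample values in main are illustrative stand-ins (the real pass derives Diff[i] and LoopDepth[i].Depth from its per-iteration depth bookkeeping); only the comparisons themselves are copied from the hunk above.

#include <cassert>

// Sketch of the check above. Diff0/Diff1 stand in for Diff[0]/Diff[1], the
// critical-path gain (in cycles) measured on the 1st and 2nd loop
// iterations; Depth0/Depth1 stand in for LoopDepth[0].Depth and
// LoopDepth[1].Depth. The default of 4 mirrors cl::init(4) in the patch.
static bool isWorthOptimizingLoop(unsigned Diff0, unsigned Diff1,
                                  unsigned Depth0, unsigned Depth1,
                                  unsigned GainCycleThreshold = 4) {
  // New in this patch: bail out when the absolute gain after the 2nd
  // iteration is below the threshold, regardless of the gain percentage.
  if (Diff1 < GainCycleThreshold)
    return false;

  bool WorthOptLoop = false;
  if (Diff1 == Diff0)
    // Iteration-independent critical path: require a gain of at least
    // 1/8 (12.5%) of the loop depth.
    WorthOptLoop = Diff0 * 8 >= Depth0;
  else if (Diff1 > Diff0)
    // Iteration-dependent critical path: require a gain gradient of at
    // least 50%, and (also new in this patch) a 12.5% gain on the 2nd
    // iteration itself.
    WorthOptLoop = (Diff1 - Diff0) * 2 >= (Depth1 - Depth0) &&
                   (Diff1 * 8 >= Depth1);
  return WorthOptLoop;
}

int main() {
  // Hypothetical numbers: a 15% relative gain (3 of 20 cycles) now fails
  // on the absolute threshold alone.
  assert(!isWorthOptimizingLoop(3, 3, 20, 20));
  // Doubling the per-iteration gain clears both tests.
  assert(isWorthOptimizingLoop(6, 6, 20, 20));
  return 0;
}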
Index: llvm/trunk/test/CodeGen/X86/pr33954.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/pr33954.ll
+++ llvm/trunk/test/CodeGen/X86/pr33954.ll
@@ -0,0 +1,91 @@
+; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; This test checks that x86-cmov-converter optimization does not transform CMOV
+;; instruction when the gain (in cycles) of converting to branch is less than
+;; a fixed threshold (measured for "-x86-cmov-converter-threshold=4").
+;;
+;; Test was created using the following command line:
+;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
+;; Where foo.c is:
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;int bar(int *a, int *b, int n) {
+;;  int sum = 0;
+;;  for (int i = 0; i < n; ++i) {
+;;    int x = a[i] * a[i+1] * a[i+2];
+;;    int y = b[i] * b[i+1];
+;;    sum += y > x ? x : 0;
+;;  }
+;;  return sum;
+;;}
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Adding a test to the above function shows code with CMOV is 25% faster than
+;; the code with branch.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;#define N 10000
+;;int A[N];
+;;int B[N];
+;;
+;;
+;;
+;;int main () {
+;;  for (int i=0; i< N; ++i) {
+;;    A[i] = i%4;
+;;    B[i] = i%5;
+;;  }
+;;  int sum = 0;
+;;  for (int i=0; i< N*10; ++i)
+;;    sum += bar(A, B, N);
+;;  return sum;
+;;}
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; CHECK-NOT: jg
+; CHECK: cmovle
+define i32 @bar(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %n) #0 {
+entry:
+  %cmp30 = icmp sgt i32 %n, 0
+  br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %.pre = load i32, i32* %a, align 4
+  %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 1
+  %.pre34 = load i32, i32* %arrayidx2.phi.trans.insert, align 4
+  %.pre35 = load i32, i32* %b, align 4
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ]
+  ret i32 %sum.0.lcssa
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ]
+  %1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ]
+  %2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ]
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %mul = mul nsw i32 %1, %2
+  %3 = add nuw nsw i64 %indvars.iv, 2
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %3
+  %4 = load i32, i32* %arrayidx5, align 4
+  %mul6 = mul nsw i32 %mul, %4
+  %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+  %5 = load i32, i32* %arrayidx11, align 4
+  %mul12 = mul nsw i32 %5, %0
+  %cmp13 = icmp sgt i32 %mul12, %mul6
+  %cond = select i1 %cmp13, i32 %mul6, i32 0
+  %add14 = add nsw i32 %cond, %sum.032
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = {"target-cpu"="skylake"}
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 2}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{!"clang version 5.0.0 (trunk)"}
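Since GainCycleThreshold is an ordinary cl::opt, the behavior this test pins down can be explored by overriding the flag when running the test by hand. An illustrative invocation, assembled from the RUN line above rather than taken from the patch:

  llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true \
      -x86-cmov-converter-threshold=0 -verify-machineinstrs < pr33954.ll

With the threshold at 0 the unsigned comparison Diff[1] < GainCycleThreshold can never hold, so the absolute-gain cutoff is disabled and only the percentage-based checks decide whether the CMOV in @bar is converted.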
Index: llvm/trunk/test/CodeGen/X86/x86-cmov-converter.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/x86-cmov-converter.ll
+++ llvm/trunk/test/CodeGen/X86/x86-cmov-converter.ll
@@ -3,13 +3,13 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; This test checks that x86-cmov-converter optimization transform CMOV
 ;; instruction into branches when it is profitable.
-;; There are 5 cases below:
+;; There are 6 cases below:
 ;;  1. CmovInCriticalPath:
 ;;       CMOV depends on the condition and it is in the hot path.
 ;;       Thus, it worths transforming.
 ;;
 ;;  2. CmovNotInCriticalPath:
-;;       similar test like in (1), just that CMOV is not in the hot path.
+;;       Similar test like in (1), just that CMOV is not in the hot path.
 ;;       Thus, it does not worth transforming.
 ;;
 ;;  3. MaxIndex:
@@ -26,16 +26,21 @@
 ;;       Usually, binary search CMOV is not predicted.
 ;;       Thus, it does not worth transforming.
 ;;
+;;  6. SmallGainPerLoop:
+;;       The gain percentage from converting CMOV into branch is acceptable,
+;;       however, the absolute gain is smaller than a threshold.
+;;       Thus, it does not worth transforming.
+;;
 ;; Test was created using the following command line:
 ;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
 ;; Where foo.c is:
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;void CmovInHotPath(int n, int a, int b, int *c, int *d) {
 ;;  for (int i = 0; i < n; i++) {
-;;    int t = c[i];
+;;    int t = c[i] + 1;
 ;;    if (c[i] * a > b)
 ;;      t = 10;
-;;    c[i] = t;
+;;    c[i] = (c[i] + 1) * t;
 ;;  }
 ;;}
 ;;
@@ -87,6 +92,16 @@
 ;;  }
 ;;  return Curr->Val;
 ;;}
+;;
+;;
+;;void SmallGainPerLoop(int n, int a, int b, int *c, int *d) {
+;;  for (int i = 0; i < n; i++) {
+;;    int t = c[i];
+;;    if (c[i] * a > b)
+;;      t = 10;
+;;    c[i] = t;
+;;  }
+;;}
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 %struct.Node = type { i32, %struct.Node*, %struct.Node* }
@@ -111,10 +126,12 @@
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
   %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
   %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 1
   %mul = mul nsw i32 %0, %a
   %cmp3 = icmp sgt i32 %mul, %b
-  %. = select i1 %cmp3, i32 10, i32 %0
-  store i32 %., i32* %arrayidx, align 4
+  %. = select i1 %cmp3, i32 10, i32 %add
+  %mul7 = mul nsw i32 %., %add
+  store i32 %mul7, i32* %arrayidx, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
@@ -253,6 +270,35 @@
   ret i32 %.lcssa
 }
 
+; CHECK-LABEL: SmallGainPerLoop
+; CHECK-NOT: jg
+; CHECK: cmovg
+
+define void @SmallGainPerLoop(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture readnone %d) #0 {
+entry:
+  %cmp14 = icmp sgt i32 %n, 0
+  br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = mul nsw i32 %0, %a
+  %cmp3 = icmp sgt i32 %mul, %b
+  %. = select i1 %cmp3, i32 10, i32 %0
+  store i32 %., i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; The following test checks that x86-cmov-converter optimization transforms
 ;; CMOV instructions into branch correctly.
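A worked illustration of why @SmallGainPerLoop keeps its CMOV, reusing the hypothetical cycle counts from the sketch after the first file (chosen for arithmetic clarity, not taken from the Skylake scheduler model): with Diff[0] = Diff[1] = 3 and LoopDepth[0].Depth = LoopDepth[1].Depth = 20, the relative-gain test would pass (3 * 8 = 24 >= 20, i.e. 15% > 12.5%), but the new absolute check fires first (3 < GainCycleThreshold = 4) and the loop is left alone, hence the CHECK: cmovg line above. This is presumably also why the patch adds an extra add and multiply to the CmovInHotPath source while its original body reappears verbatim as SmallGainPerLoop: the old body's gain per loop falls below the new threshold, and the heavier body keeps CmovInHotPath profitable enough to remain a positive test.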