Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -208,7 +208,7 @@
         "The cost of a loop that is considered 'small' by the interleaver."));
 
 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
-    "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
+    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
     cl::desc("Enable the use of the block frequency analysis to access PGO "
              "heuristics minimizing code growth in cold regions and being more "
              "aggressive in hot regions."));
@@ -8347,9 +8347,21 @@
 
   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
   // count by optimizing for size, to minimize overheads.
-  unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
-  bool HasExpectedTC = (ExpectedTC > 0);
-
+  // Prefer constant trip counts over profile data, over upper bound estimate.
+  unsigned ExpectedTC = 0;
+  bool HasExpectedTC = false;
+  if (const SCEVConstant *ConstExits =
+      dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
+    const APInt &ExitsCount = ConstExits->getAPInt();
+    // We are interested in small values for ExpectedTC. Skip over those that
+    // can't fit an unsigned.
+    if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
+      ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
+      HasExpectedTC = true;
+    }
+  }
+  // ExpectedTC may be large because it's bound by a variable. Check
+  // profiling information to validate we should vectorize.
   if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
     auto EstimatedTC = getLoopEstimatedTripCount(L);
     if (EstimatedTC) {
@@ -8357,6 +8369,10 @@
       HasExpectedTC = true;
     }
   }
+  if (!HasExpectedTC) {
+    ExpectedTC = SE->getSmallConstantMaxTripCount(L);
+    HasExpectedTC = (ExpectedTC > 0);
+  }
 
   if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
Index: llvm/trunk/test/Transforms/LoopVectorize/tripcount.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/tripcount.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/tripcount.ll
@@ -57,7 +57,7 @@
 }
 
 define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
-; The loop has low invocation count compare to the function invocation count,
+; The loop has low invocation count compare to the function invocation count,
 ; but has a high trip count per invocation. Vectorize it.
 
 ; CHECK-LABEL: @foo_low_trip_count3(
@@ -84,6 +84,126 @@
   ret i32 0
 }
 
+define i32 @foo_low_trip_count_icmp_sgt(i32 %bound) {
+; Simple loop with low tripcount and inequality test for exit.
+; Should not be vectorized.
+
+; CHECK-LABEL: @foo_low_trip_count_icmp_sgt(
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp sgt i32 %i.08, %bound
+  br i1 %exitcond, label %for.end, label %for.body, !prof !1
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+define i32 @const_low_trip_count() {
+; Simple loop with constant, small trip count and no profiling info.
+
+; CHECK-LABEL: @const_low_trip_count
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp slt i32 %i.08, 2
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+define i32 @const_large_trip_count() {
+; Simple loop with constant large trip count and no profiling info.
+
+; CHECK-LABEL: @const_large_trip_count
+; CHECK: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp slt i32 %i.08, 1000
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+define i32 @const_small_trip_count_step() {
+; Simple loop with static, small trip count and no profiling info.
+
+; CHECK-LABEL: @const_small_trip_count_step
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 5
+  %exitcond = icmp slt i32 %i.08, 10
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+define i32 @const_trip_over_profile() {
+; constant trip count takes precedence over profile data
+
+; CHECK-LABEL: @const_trip_over_profile
+; CHECK: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp slt i32 %i.08, 1000
+  br i1 %exitcond, label %for.body, label %for.end, !prof !1
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
 
 !0 = !{!"function_entry_count", i64 100}
 !1 = !{!"branch_weights", i32 100, i32 0}