diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -294,6 +294,14 @@ "vectorize-loops", cl::init(true), cl::Hidden, cl::desc("Run the Loop vectorization passes")); +static cl::opt<unsigned> LocalHotnessThreshold( + "local-hotness-threshold", cl::init(500), cl::Hidden, + cl::desc( + "In cases when there is no info on block hotness available from module " + "profile we define \"local hotness\" as a ratio of the block to " + "function entry execution counts. If the ratio is greater than the " + "threshold defined by this parameter the block is said to be locally " + "hot.")); /// A helper function for converting Scalar types to vector types. /// If the incoming type is void, we return void. If the VF is 1, we return /// the scalar type. @@ -7439,6 +7447,39 @@ (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI) && Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) SEL = CM_ScalarEpilogueNotNeededUsePredicate; + else { + auto ExpectedTC = getSmallBestKnownTC(*SE, L); + // Check the loop for a trip count threshold: vectorize loops with a tiny + // trip count by optimizing for size, to minimize overheads. + if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { + // Even short trip count loops may be hot (part of hot region). + // In absence of profile summary estimate loop hotness relative to + // function entry using execution frequency information. 
+ if (!IsColdByProfile && LoopVectorizeWithBlockFrequency && BFI) { + Optional<uint64_t> LoopCount = + BFI->getBlockProfileCount(L->getHeader(), true); + Optional<uint64_t> FunctionCount = + BFI->getBlockProfileCount(&F->getEntryBlock(), true); + if (LoopCount && FunctionCount && + (*LoopCount > *FunctionCount * LocalHotnessThreshold)) { + LLVM_DEBUG(dbgs() << "Allow epilog for short trip count loop due to " + "hotness considerations.\n"); + return CM_ScalarEpilogueAllowed; + } + } + + LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " + << "This loop is worth vectorizing only if no scalar " + << "iteration overheads are incurred."); + + if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) + LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); + else { + LLVM_DEBUG(dbgs() << "\n"); + SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; + } + } + } return SEL; } @@ -7567,21 +7608,6 @@ assert(L->empty() && "Inner loop expected."); - // Check the loop for a trip count threshold: vectorize loops with a tiny trip - // count by optimizing for size, to minimize overheads. - auto ExpectedTC = getSmallBestKnownTC(*SE, L); - if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { - LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " - << "This loop is worth vectorizing only if no scalar " - << "iteration overheads are incurred."); - if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) - LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); - else { - LLVM_DEBUG(dbgs() << "\n"); - SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; - } - } - // Check the function attributes to see if implicit floats are allowed. 
// FIXME: This check doesn't seem possibly correct -- what if the loop is // an integer loop and the vector instructions selected are purely integer diff --git a/llvm/test/Transforms/LoopVectorize/hot_short_tc_loop.ll b/llvm/test/Transforms/LoopVectorize/hot_short_tc_loop.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/hot_short_tc_loop.ll @@ -0,0 +1,205 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes="print<block-freq>,loop-vectorize" -S < %s 2>&1 | FileCheck %s + +; Check vectorization of hot short trip count with epilog. In this case inner +; loop trip count is not constant and its value is estimated by profile. + +; ModuleID = 'test.cpp' +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = dso_local global [5 x i32] zeroinitializer, align 16 +@b = dso_local global [5 x i32] zeroinitializer, align 16 + +; Function Attrs: uwtable +define dso_local void @_Z3fooi(i32 %M) local_unnamed_addr #0 !prof !11 { +; CHECK: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15:%.*]] +; CHECK: [[TMP18:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND6:%.*]] +; CHECK: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP23:%.*]] +; CHECK: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP18]] +; CHECK: store <4 x i32> [[TMP26]], <4 x i32>* [[TMP28:%.*]] +; +entry: + %a = alloca [5 x i32], align 16 + %b = alloca [5 x i32], align 16 + %0 = bitcast [5 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3 + %1 = bitcast [5 x i32]* %b to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3 + %arraydecay = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 0 + br label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + %wide.trip.count = zext i32 %M to i64 + br label %for.body.us + +for.body.us: ; preds = 
%for.cond1.for.cond.cleanup3_crit_edge.us, %for.body.us.preheader + %j.019.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.body.us.preheader ] + call void @_Z3barPi(i32* nonnull %arraydecay) + br label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.body.us + %indvars.iv = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next, %for.body4.us ] + %arrayidx.us = getelementptr inbounds [5 x i32], [5 x i32]* %b, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx.us, align 4, !tbaa !2 + %3 = trunc i64 %indvars.iv to i32 + %mul.us = mul nsw i32 %2, %3 + %arrayidx6.us = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 %indvars.iv + %4 = load i32, i32* %arrayidx6.us, align 4, !tbaa !2 + %add.us = add nsw i32 %4, %mul.us + store i32 %add.us, i32* %arrayidx6.us, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us, !prof !10 + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us + %inc8.us = add nuw nsw i32 %j.019.us, 1 + %exitcond21 = icmp eq i32 %inc8.us, 20 + br i1 %exitcond21, label %for.cond.cleanup.loopexit, label %for.body.us, !prof !12 + +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit24: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit24, %for.cond.cleanup.loopexit + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %1) #3 + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %0) #3 + ret void +} + +; Check vectorization of hot short trip count with epilog. In this case inner +; loop trip count is known constant value. 
+ +; Function Attrs: uwtable +define dso_local void @_Z3fooi2() local_unnamed_addr #0 !prof !11 { +; CHECK: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15:%.*]] +; CHECK: [[TMP18:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND6:%.*]] +; CHECK: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP23:%.*]] +; CHECK: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP18]] +; CHECK: store <4 x i32> [[TMP26]], <4 x i32>* [[TMP28:%.*]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3 + ret void + +for.body: ; preds = %entry, %for.cond.cleanup3 + %j.018 = phi i32 [ 0, %entry ], [ %inc8, %for.cond.cleanup3 ] + tail call void @_Z3barPi(i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 0, i64 0)) + br label %for.body4 + +for.cond.cleanup3: ; preds = %for.body4 + %inc8 = add nuw nsw i32 %j.018, 1 + %cmp = icmp ult i32 %inc8, 1000 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !prof !13 + +for.body4: ; preds = %for.body, %for.body4 + %i.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body4 ] + %idxprom = zext i32 %i.017 to i64 + %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul = mul nsw i32 %0, %i.017 + %arrayidx6 = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 %idxprom + %1 = load i32, i32* %arrayidx6, align 4, !tbaa !2 + %add = add nsw i32 %1, %mul + store i32 %add, i32* %arrayidx6, align 4, !tbaa !2 + %inc = add nuw nsw i32 %i.017, 1 + %cmp2 = icmp ult i32 %inc, 5 + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 +} + +; This is negative test. Check that vectorization is not performed for COLD +; short trip count loop requiring epilog. Note that outer loop has only 20 +; iterations and there is no associated profile info. 
+ + +; Function Attrs: uwtable +define dso_local void @_Z3fooi3(i32 %M) local_unnamed_addr #0 !prof !11 { +; CHECK: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US:%.*]] +; CHECK: [[MUL_US:%.*]] = mul nsw i32 [[TMP2]], [[TMP3:%.*]] +; CHECK: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX6_US:%.*]] +; CHECK: [[ADD_US:%.*]] = add nsw i32 [[TMP4]], [[MUL_US]] +; CHECK: store i32 [[ADD_US]], i32* [[ARRAYIDX6_US]] +; +entry: + %a = alloca [5 x i32], align 16 + %b = alloca [5 x i32], align 16 + %0 = bitcast [5 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3 + %1 = bitcast [5 x i32]* %b to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3 + %arraydecay = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 0 + br label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + %wide.trip.count = zext i32 %M to i64 + br label %for.body.us + +for.body.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.body.us.preheader + %j.019.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.body.us.preheader ] + call void @_Z3barPi(i32* nonnull %arraydecay) + br label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.body.us + %indvars.iv = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next, %for.body4.us ] + %arrayidx.us = getelementptr inbounds [5 x i32], [5 x i32]* %b, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx.us, align 4, !tbaa !2 + %3 = trunc i64 %indvars.iv to i32 + %mul.us = mul nsw i32 %2, %3 + %arrayidx6.us = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 %indvars.iv + %4 = load i32, i32* %arrayidx6.us, align 4, !tbaa !2 + %add.us = add nsw i32 %4, %mul.us + store i32 %add.us, i32* %arrayidx6.us, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us, !prof !14 + 
+for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us + %inc8.us = add nuw nsw i32 %j.019.us, 1 + %exitcond21 = icmp eq i32 %inc8.us, 20 + br i1 %exitcond21, label %for.cond.cleanup.loopexit, label %for.body.us + +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit24: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit24, %for.cond.cleanup.loopexit + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %1) #3 + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %0) #3 + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 + +declare dso_local void @_Z3barPi(i32*) local_unnamed_addr + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 + +attributes #0 = { "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project f379dd57b978c4e1483d721f422c79e3c0c5ccdc)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C++ TBAA"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.isvectorized", i32 1} +!8 = distinct !{!8, !9, !7} +!9 = !{!"llvm.loop.unroll.runtime.disable"} +!10 = !{!"branch_weights", i32 999, i32 4995} +!11 = !{!"function_entry_count", i64 1} +!12 = !{!"branch_weights", i32 1, i32 999} +!13 = !{!"branch_weights", i32 1000, i32 1} +!14 = !{!"branch_weights", i32 9, i32 45}