Index: llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorize.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorize.h
+++ llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -87,8 +87,6 @@
   std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
   OptimizationRemarkEmitter *ORE;
 
-  BlockFrequency ColdEntryFreq;
-
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 
   // Shim for old PM.
Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5702,14 +5702,14 @@
 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
 
   // We should not collect Uniforms more than once per VF. Right now,
-  // this function is called from collectUniformsAndScalars(), which 
+  // this function is called from collectUniformsAndScalars(), which
   // already does this check. Collecting Uniforms for VF=1 does not make any
   // sense.
 
   assert(VF >= 2 && !Uniforms.count(VF) &&
          "This function should not be visited twice for the same VF");
 
-  // Visit the list of Uniforms. If we'll not find any uniform value, we'll 
+  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
   // not analyze again. Uniforms.count(VF) will return 1.
   Uniforms[VF].clear();
 
@@ -5988,10 +5988,10 @@
         continue;
 
       Value *Ptr = getPointerOperand(&I);
-      // We don't check wrapping here because we don't know yet if Ptr will be 
-      // part of a full group or a group with gaps. Checking wrapping for all 
+      // We don't check wrapping here because we don't know yet if Ptr will be
+      // part of a full group or a group with gaps. Checking wrapping for all
       // pointers (even those that end up in groups with no gaps) will be overly
-      // conservative. For full groups, wrapping should be ok since if we would 
+      // conservative. For full groups, wrapping should be ok since if we would
       // wrap around the address space we would do a memory access at nullptr
       // even without the transformation. The wrapping checks are therefore
       // deferred until after we've formed the interleaved groups.
@@ -6244,7 +6244,7 @@
     Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
     if (LastMember) {
       Value *LastMemberPtr = getPointerOperand(LastMember);
-      if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false, 
+      if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
                         /*ShouldCheckWrap=*/true)) {
         DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
                         "last group member potentially pointer-wrapping.\n");
@@ -6252,9 +6252,9 @@
       }
     } else {
       // Case 3: A non-reversed interleaved load group with gaps: We need
-      // to execute at least one scalar epilogue iteration. This will ensure 
+      // to execute at least one scalar epilogue iteration. This will ensure
       // we don't speculatively access memory out-of-bounds. We only need
-      // to look for a member at index factor - 1, since every group must have 
+      // to look for a member at index factor - 1, since every group must have
       // a member at index zero.
       if (Group->isReverse()) {
         releaseGroup(Group);
@@ -7789,8 +7789,18 @@
 
   // Check the loop for a trip count threshold:
   // do not vectorize loops with a tiny trip count.
-  const unsigned MaxTC = SE->getSmallConstantMaxTripCount(L);
-  if (MaxTC > 0u && MaxTC < TinyTripCountVectorThreshold) {
+  unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
+  bool HasExpectedTC = (ExpectedTC > 0);
+
+  if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
+    auto EstimatedTC = getLoopEstimatedTripCount(L);
+    if (EstimatedTC) {
+      ExpectedTC = *EstimatedTC;
+      HasExpectedTC = true;
+    }
+  }
+
+  if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                  << "This loop is not worth vectorizing.");
     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
@@ -7822,18 +7832,6 @@
   bool OptForSize =
       Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
 
-  // Compute the weighted frequency of this loop being executed and see if it
-  // is less than 20% of the function entry baseline frequency. Note that we
-  // always have a canonical loop here because we think we *can* vectorize.
-  // FIXME: This is hidden behind a flag due to pervasive problems with
-  // exactly what block frequency models.
-  if (LoopVectorizeWithBlockFrequency) {
-    BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
-    if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
-        LoopEntryFreq < ColdEntryFreq)
-      OptForSize = true;
-  }
-
   // Check the function attributes to see if implicit floats are allowed.
   // FIXME: This check doesn't seem possibly correct -- what if the loop is
   // an integer loop and the vector instructions selected are purely integer
@@ -8015,11 +8013,6 @@
   DB = &DB_;
   ORE = &ORE_;
 
-  // Compute some weights outside of the loop over the loops. Compute this
-  // using a BranchProbability to re-use its scaling math.
-  const BranchProbability ColdProb(1, 5); // 20%
-  ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
-
   // Don't attempt if
   // 1. the target claims to have no vector registers, and
   // 2. interleaving won't help ILP.
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -115,32 +115,6 @@
   ret void
 }
 
-; N is unknown, we need a tail. Can't vectorize because the loop is cold.
-;CHECK-LABEL: @example4(
-;CHECK-NOT: <4 x i32>
-;CHECK: ret void
-define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) {
-  %1 = icmp eq i32 %n, 0
-  br i1 %1, label %._crit_edge, label %.lr.ph, !prof !0
-
-.lr.ph:                                           ; preds = %0, %.lr.ph
-  %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]
-  %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]
-  %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]
-  %2 = add nsw i32 %.05, -1
-  %3 = getelementptr inbounds i32, i32* %.023, i64 1
-  %4 = load i32, i32* %.023, align 16
-  %5 = getelementptr inbounds i32, i32* %.014, i64 1
-  store i32 %4, i32* %.014, align 16
-  %6 = icmp eq i32 %2, 0
-  br i1 %6, label %._crit_edge, label %.lr.ph
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret void
-}
-
-!0 = !{!"branch_weights", i32 64, i32 4}
-
 ; We can't vectorize this one because we need a runtime ptr check.
 ;CHECK-LABEL: @example23(
 ;CHECK-NOT: <4 x i32>
Index: llvm/trunk/test/Transforms/LoopVectorize/tripcount.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/tripcount.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/tripcount.ll
@@ -0,0 +1,91 @@
+; This test verifies that the loop vectorizer will not vectorize low trip count
+; loops that require runtime checks (the trip count is computed with profile info).
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -loop-vectorize-with-block-frequency -S | FileCheck %s
+
+target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
+
+@tab = common global [32 x i8] zeroinitializer, align 1
+
+define i32 @foo_low_trip_count1(i32 %bound) {
+; Simple loop with a low trip count. Should not be vectorized.
+
+; CHECK-LABEL: @foo_low_trip_count1(
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, %bound
+  br i1 %exitcond, label %for.end, label %for.body, !prof !1
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+define i32 @foo_low_trip_count2(i32 %bound) !prof !0 {
+; The loop has the same invocation count as the function, but has a low
+; trip count per invocation, so it is not worth vectorizing.
+
+; CHECK-LABEL: @foo_low_trip_count2(
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, %bound
+  br i1 %exitcond, label %for.end, label %for.body, !prof !1
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
+; The loop has a low invocation count compared to the function invocation count,
+; but has a high trip count per invocation. Vectorize it.
+
+; CHECK-LABEL: @foo_low_trip_count3(
+; CHECK: vector.body:
+
+entry:
+  br i1 %cond, label %for.preheader, label %for.end, !prof !2
+
+for.preheader:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.preheader
+  %i.08 = phi i32 [ 0, %for.preheader ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, %bound
+  br i1 %exitcond, label %for.end, label %for.body, !prof !3
+
+for.end:                                          ; preds = %for.body, %entry
+  ret i32 0
+}
+
+
+!0 = !{!"function_entry_count", i64 100}
+!1 = !{!"branch_weights", i32 100, i32 0}
+!2 = !{!"branch_weights", i32 10, i32 90}
+!3 = !{!"branch_weights", i32 10, i32 10000}
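
For readers skimming the processLoop() hunk above, the standalone sketch below restates the new trip-count gating in isolation. It is not LLVM code: getSmallConstantMaxTripCountFor(), getProfileEstimatedTripCountFor(), and TinyTripCountThreshold are hypothetical stand-ins for SE->getSmallConstantMaxTripCount(L), getLoopEstimatedTripCount(L), and TinyTripCountVectorThreshold (the value 16 is an assumed default); treat it as an illustration of the control flow, not as the implementation.

// Standalone sketch of the new trip-count gating (illustration only; the
// helper names and the threshold value are assumptions, see note above).
#include <cstdio>
#include <optional>

constexpr unsigned TinyTripCountThreshold = 16; // assumed default value

// Stand-in for SE->getSmallConstantMaxTripCount(L): exact bound, or 0 if unknown.
unsigned getSmallConstantMaxTripCountFor(bool BoundIsCompileTimeConstant) {
  return BoundIsCompileTimeConstant ? 8u : 0u;
}

// Stand-in for getLoopEstimatedTripCount(L): profile-derived estimate, if any.
std::optional<unsigned> getProfileEstimatedTripCountFor(bool HasProfile) {
  if (!HasProfile)
    return std::nullopt;
  return 5u; // e.g. derived from !prof branch_weights on the loop latch
}

bool worthVectorizing(bool BoundIsConstant, bool HasProfile,
                      bool UseBlockFrequency) {
  // Prefer the exact compile-time bound when scalar evolution knows it.
  unsigned ExpectedTC = getSmallConstantMaxTripCountFor(BoundIsConstant);
  bool HasExpectedTC = (ExpectedTC > 0);

  // Otherwise fall back to the profile-based estimate; the patch keeps this
  // behind the -loop-vectorize-with-block-frequency flag.
  if (!HasExpectedTC && UseBlockFrequency) {
    if (auto EstimatedTC = getProfileEstimatedTripCountFor(HasProfile)) {
      ExpectedTC = *EstimatedTC;
      HasExpectedTC = true;
    }
  }

  // A known or estimated tiny trip count makes the loop not worth vectorizing
  // (the "forced" override from the real pass is omitted in this sketch).
  return !(HasExpectedTC && ExpectedTC < TinyTripCountThreshold);
}

int main() {
  std::printf("constant small bound -> vectorize? %d\n",
              worthVectorizing(true, false, true));
  std::printf("profiled small count -> vectorize? %d\n",
              worthVectorizing(false, true, true));
  std::printf("no trip-count info   -> vectorize? %d\n",
              worthVectorizing(false, false, true));
  return 0;
}

The shape mirrors the patch: an exact SCEV bound still wins when it is known, the profile-based estimate is only consulted behind the block-frequency flag (replacing the old ColdEntryFreq heuristic), and a loop with no trip-count information at all remains a vectorization candidate.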