diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -168,9 +168,9 @@
 /// Information about vectorization costs
 struct VectorizationFactor {
   // Vector width with best cost
-  unsigned Width;
+  unsigned Width = 0;
   // Cost of the loop with that width
-  unsigned Cost;
+  unsigned Cost = 0;
 
   // Width 1 means no vectorization, cost 0 means uncomputed cost.
   static VectorizationFactor Disabled() { return {1, 0}; }
@@ -243,8 +243,9 @@
   void setBestPlan(unsigned VF, unsigned UF);
 
   /// Generate the IR code for the body of the vectorized loop according to the
-  /// best selected VPlan.
-  void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+  /// best selected VPlan. Returns 'true' if we successfully generated a
+  /// vector loop, 'false' otherwise.
+  bool executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
 
   void printPlans(raw_ostream &O) {
     for (const auto &Plan : VPlans)
@@ -285,6 +286,11 @@
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
   void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF);
+
+  /// Returns 'false' if the additional overhead of the generated runtime
+  /// checks (trip count, memory dependency and SCEV checks) makes
+  /// vectorization unprofitable, 'true' otherwise.
+  bool mayDisregardRTChecksOverhead(InnerLoopVectorizer &ILV);
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -362,7 +362,7 @@
 /// 2) Returns expected trip count according to profile data if any.
 /// 3) Returns upper bound estimate if it is known.
 /// 4) Returns None if all of the above failed.
-static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
+Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
   // Check if exact trip count is known.
   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
     return ExpectedTC;
@@ -927,12 +927,6 @@
   // Vectorization with OptForSize: don't allow epilogues.
   CM_ScalarEpilogueNotAllowedOptSize,
 
-  // A special case of vectorisation with OptForSize: loops with a very small
-  // trip count are considered for vectorization under OptForSize, thereby
-  // making sure the cost of their loop body is dominant, free of runtime
-  // guards and scalar iteration overheads.
-  CM_ScalarEpilogueNotAllowedLowTripLoop,
-
   // Loop hint predicate indicating an epilogue is undesired.
   CM_ScalarEpilogueNotNeededUsePredicate
 };
@@ -965,7 +959,7 @@
 
   /// \return True if runtime checks are required for vectorization, and false
   /// otherwise.
-  bool runtimeChecksRequired();
+  bool runtimeChecksRequired(bool ReportFailure);
 
   /// \return The most profitable vectorization factor and the cost of that VF.
   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
@@ -1294,13 +1288,6 @@
   /// i.e. either vector version isn't available, or is too expensive.
   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
 
-private:
-  unsigned NumPredStores = 0;
-
-  /// \return An upper bound for the vectorization factor, larger than zero.
-  /// One is returned if vectorization should best be avoided due to cost.
-  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
-
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
   /// operate on
@@ -1310,16 +1297,23 @@
   /// actually taken place).
   using VectorizationCostTy = std::pair<unsigned, bool>;
 
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
+
+private:
+  unsigned NumPredStores = 0;
+
+  /// \return An upper bound for the vectorization factor, larger than zero.
+  /// One is returned if vectorization should best be avoided due to cost.
+  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
+
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
   /// the factor width.
   VectorizationCostTy expectedCost(unsigned VF);
 
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
@@ -1501,6 +1495,11 @@
 
   /// Values to ignore in the cost model when VF > 1.
   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+
+  /// Cached {VF, Cost} for the scalar loop (VF == 1).
+  VectorizationFactor ScalarVF;
+  /// Cached {VF, Cost} for the best expected vectorization mode.
+  VectorizationFactor BestVF;
 };
 
 } // end namespace llvm
@@ -4843,34 +4842,40 @@
   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
 }
 
-bool LoopVectorizationCostModel::runtimeChecksRequired() {
+bool LoopVectorizationCostModel::runtimeChecksRequired(bool ReportFailure) {
   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
 
   if (Legal->getRuntimePointerChecking()->Need) {
-    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
-        "runtime pointer checks needed. Enable vectorization of this "
-        "loop with '#pragma clang loop vectorize(enable)' when "
-        "compiling with -Os/-Oz",
-        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    if (ReportFailure)
+      reportVectorizationFailure(
+          "Runtime ptr check is required with -Os/-Oz",
+          "runtime pointer checks needed. Enable vectorization of this "
+          "loop with '#pragma clang loop vectorize(enable)' when "
+          "compiling with -Os/-Oz",
+          "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }
 
  if (!PSE.getUnionPredicate().getPredicates().empty()) {
-    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
-        "runtime SCEV checks needed. Enable vectorization of this "
-        "loop with '#pragma clang loop vectorize(enable)' when "
-        "compiling with -Os/-Oz",
-        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    if (ReportFailure)
+      reportVectorizationFailure(
+          "Runtime SCEV check is required with -Os/-Oz",
+          "runtime SCEV checks needed. Enable vectorization of this "
+          "loop with '#pragma clang loop vectorize(enable)' when "
+          "compiling with -Os/-Oz",
+          "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }
 
  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
-    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
-        "runtime stride == 1 checks needed. Enable vectorization of "
-        "this loop with '#pragma clang loop vectorize(enable)' when "
-        "compiling with -Os/-Oz",
-        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    if (ReportFailure)
+      reportVectorizationFailure(
+          "Runtime stride check is required with -Os/-Oz",
+          "runtime stride == 1 checks needed. Enable vectorization of "
+          "this loop with '#pragma clang loop vectorize(enable)' when "
+          "compiling with -Os/-Oz",
+          "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }
@@ -4898,27 +4903,32 @@
   }
 
   switch (ScalarEpilogueStatus) {
-  case CM_ScalarEpilogueAllowed:
+  case CM_ScalarEpilogueAllowed: {
+    // Prefer masked vectorization for short trip count loops without runtime
+    // checks; this preserves the legacy behavior.
+    // TODO: Ideally this decision should be made by the cost model.
+    auto ExpectedTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
+    if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold &&
+        !runtimeChecksRequired(false)) {
+      LLVM_DEBUG(dbgs() << "LV: Prefer masked vectorization for short trip "
+                        << "count loop.\n");
+      break;
+    }
     return computeFeasibleMaxVF(TC);
+  }
   case CM_ScalarEpilogueNotNeededUsePredicate:
     LLVM_DEBUG(
         dbgs() << "LV: vector predicate hint/switch found.\n"
                << "LV: Not allowing scalar epilogue, creating predicated "
                << "vector loop.\n");
     break;
-  case CM_ScalarEpilogueNotAllowedLowTripLoop:
-    // fallthrough as a special case of OptForSize
   case CM_ScalarEpilogueNotAllowedOptSize:
-    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
       LLVM_DEBUG(
           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
-    else
-      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
-                        << "count.\n");
     // Bail if runtime checks are required, which are not good when optimising
     // for size.
-    if (runtimeChecksRequired())
+    if (runtimeChecksRequired(true))
       return None;
     break;
   }
 
@@ -4946,6 +4956,14 @@
     return MaxVF;
   }
 
+  if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed) {
+    LLVM_DEBUG(
+        dbgs() << "LV: Masked vectorization is not allowed. Continue with "
Continue with" + "'normal' vectorization using epilogue\n"); + + return MaxVF; + } + if (TC == 0) { reportVectorizationFailure( "Unable to calculate the loop count due to complex control flow", @@ -5041,10 +5059,11 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { - float Cost = expectedCost(1).first; - const float ScalarCost = Cost; + ScalarVF = { 1, expectedCost(1).first }; + float Cost = ScalarVF.Cost; unsigned Width = 1; - LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); + LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarVF.Cost + << ".\n"); bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; if (ForceVectorization && MaxVF > 1) { @@ -5079,15 +5098,16 @@ "store that is conditionally executed prevents vectorization", "ConditionalStore", ORE, TheLoop); Width = 1; - Cost = ScalarCost; + Cost = ScalarVF.Cost; } - LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() - << "LV: Vectorization seems to be not beneficial, " - << "but was forced by a user.\n"); + LLVM_DEBUG( + if (ForceVectorization && Width > 1 && Cost >= ScalarVF.Cost) dbgs() + << "LV: Vectorization seems to be not beneficial, " + << "but was forced by a user.\n"); LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); - VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; - return Factor; + BestVF = { Width, (unsigned)(Width * Cost) }; + return BestVF; } std::pair @@ -6514,7 +6534,117 @@ assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); } -void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, +// Helper function to calculate cost of all instructions in the \p C. +template +static uint64_t getCostOfBlocks(LoopVectorizationCostModel &CM, + Container &&C) { + uint64_t TotalCost = 0; + for (BasicBlock *BB : C) { + for (Instruction &I : *BB) { + auto InstCost = CM.getInstructionCost(&I, 1).first; + TotalCost += InstCost; + LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << InstCost + << " for VF " << 1 << " For instruction: " << I + << '\n'); + } + } + return TotalCost; +} + +// Returns 'false' if additional overhead from generated runtime checks (trip +// count, memory dependency and SCEV checks) makes vectorization not profitable, +// 'true' otherwise. +bool LoopVectorizationPlanner::mayDisregardRTChecksOverhead( + InnerLoopVectorizer &ILV) { + Optional ExpectedTC = + getSmallBestKnownTC(*CM.PSE.getSE(), OrigLoop); + + // No need to check for RT overhead for loops expected not to have short + // trip count. + // TODO: This is done this was to preserve legacy behavior. We should change + // that eventually and be checking for RT overhead for all loops regardless of + // TC. + if (!ExpectedTC || *ExpectedTC >= TinyTripCountVectorThreshold) { + LLVM_DEBUG( + dbgs() << "LV: Disregarding run-time checks overhead: not short trip " + "count loop.\n"); + return true; + } + + // No need to check for RT overhead if vectorization was forced. Note that + // cost modeling still may be performed to select best VF. + if (CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) { + LLVM_DEBUG(dbgs() << "LV: Disregarding run-time checks overhead: " + "vectorization was forced.\n"); + return true; + } + + // No need to check for RT overhead if cost modeling was skipped and VF + // selected by the user. 
+  if (CM.ScalarVF.Width == 0) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: Disregarding run-time checks overhead: VF was forced.\n");
+    return true;
+  }
+
+  assert(CM.BestVF.Width != 0 && CM.BestVF.Width > 1 &&
+         "Best VF was not properly selected?");
+
+  LLVM_DEBUG(dbgs() << "LV: Checking cost of run-time overhead for short "
+                       "trip count loop.\n");
+
+  uint64_t VecTripCount = *ExpectedTC / CM.BestVF.Width;
+  uint64_t EpilogTripCount = *ExpectedTC % CM.BestVF.Width;
+
+  uint64_t VecRTCost = getCostOfBlocks(CM, ILV.LoopBypassBlocks) +
+                       getCostOfBlocks<std::initializer_list<BasicBlock *> >(
+                           CM, { ILV.LoopVectorPreHeader, ILV.LoopMiddleBlock,
+                                 ILV.LoopScalarPreHeader });
+  uint64_t VecCost = CM.BestVF.Cost * VecTripCount;
+  uint64_t EpilogCost = CM.ScalarVF.Cost * EpilogTripCount;
+  uint64_t VecTotalCost = VecRTCost + VecCost + EpilogCost;
+  uint64_t ScalarTotalCost = CM.ScalarVF.Cost * (*ExpectedTC);
+
+  LLVM_DEBUG(dbgs() << "LV: ScalarTotalCost = " << ScalarTotalCost << "\n");
+  LLVM_DEBUG(dbgs() << "LV: VecTotalCost = RTCost + (VecCost * VecTC) + "
+                       "(EpilogCost * EpilogTC) = " << VecRTCost << " + ("
+                    << CM.BestVF.Cost << " * " << VecTripCount << ") + ("
+                    << CM.ScalarVF.Cost << " * " << EpilogTripCount
+                    << ") = " << VecTotalCost << "\n");
+
+  if (VecTotalCost >= ScalarTotalCost) {
+    assert(isa<BranchInst>(ILV.LoopBypassBlocks.front()->getTerminator()) &&
+           "RT check should end with branch instruction.");
+
+    // Make the vectorized loop effectively dead. Later optimizations should
+    // clean it up.
+    auto *BrInst =
+        cast<BranchInst>(ILV.LoopBypassBlocks.front()->getTerminator());
+    BrInst->setCondition(
+        ConstantInt::getTrue(BrInst->getCondition()->getType()));
+
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: It's not profitable to vectorize short trip count loop.\n");
+
+    ILV.ORE->emit([&]() {
+      return OptimizationRemark(LV_NAME, "Not Vectorized",
+                                ILV.OrigLoop->getStartLoc(),
+                                ILV.OrigLoop->getHeader())
+             << "not profitable to vectorize short trip count loop.";
+    });
+
+    return false;
+  }
+
+  LLVM_DEBUG(
+      dbgs()
+      << "LV: It's still profitable to vectorize short trip count loop.\n");
+  return true;
+}
+
+bool LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                            DominatorTree *DT) {
   // Perform the actual loop transformation.
 
@@ -6535,13 +6665,20 @@
   //
   //===------------------------------------------------===//
 
-  // 2. Copy and widen instructions from the old loop into the new loop.
-  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
-  VPlans.front()->execute(&State);
+  bool IsVectorizationProfitable = mayDisregardRTChecksOverhead(ILV);
+  // Skip generation of the vector body if vectorization turned out not to be
+  // profitable (the vector loop is dead in this case).
+  if (IsVectorizationProfitable) {
+    // 2. Copy and widen instructions from the old loop into the new loop.
+    assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
+    VPlans.front()->execute(&State);
+
+    // 3. Fix the vectorized code: take care of header phi's, live-outs,
+    // predication, updating analyses.
+    ILV.fixVectorizedLoop();
+  }
 
-  // 3. Fix the vectorized code: take care of header phi's, live-outs,
-  // predication, updating analyses.
-  ILV.fixVectorizedLoop();
+  return IsVectorizationProfitable;
 }
 
 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
@@ -7596,21 +7733,6 @@
 
   assert(L->empty() && "Inner loop expected.");
 
-  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
-  // count by optimizing for size, to minimize overheads.
-  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
-  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
-    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
-                      << "This loop is worth vectorizing only if no scalar "
-                      << "iteration overheads are incurred.");
-    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
-      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
-    else {
-      LLVM_DEBUG(dbgs() << "\n");
-      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
-    }
-  }
-
   // Check the function attributes to see if implicit floats are allowed.
   // FIXME: This check doesn't seem possibly correct -- what if the loop is
   // an integer loop and the vector instructions selected are purely integer
@@ -7787,23 +7909,24 @@
     // If we decided that it is *legal* to vectorize the loop, then do it.
     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                            &LVL, &CM);
-    LVP.executePlan(LB, DT);
-    ++LoopsVectorized;
-
-    // Add metadata to disable runtime unrolling a scalar loop when there are
-    // no runtime checks about strides and memory. A scalar loop that is
-    // rarely used is not worth unrolling.
-    if (!LB.areSafetyChecksAdded())
-      DisableRuntimeUnroll = true;
-
-    // Report the vectorization decision.
-    ORE->emit([&]() {
-      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
-                                L->getHeader())
-             << "vectorized loop (vectorization width: "
-             << NV("VectorizationFactor", VF.Width)
-             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
-    });
+    if (LVP.executePlan(LB, DT)) {
+      ++LoopsVectorized;
+
+      // Add metadata to disable runtime unrolling a scalar loop when there are
+      // no runtime checks about strides and memory. A scalar loop that is
+      // rarely used is not worth unrolling.
+      if (!LB.areSafetyChecksAdded())
+        DisableRuntimeUnroll = true;
+
+      // Report the vectorization decision.
+      ORE->emit([&]() {
+        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
+                                  L->getHeader())
+               << "vectorized loop (vectorization width: "
+               << NV("VectorizationFactor", VF.Width)
+               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
+      });
+    }
   }
 
   Optional<MDNode *> RemainderLoopID =
diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -169,12 +169,12 @@
 ; trip count leads to opt-for-size (which otherwise could fold the tail by
 ; masking).
; CHECK-LABEL: @main -; CHECK-NOT: vector.scevcheck -; CHECK-NOT: vector.body: +; CHECK: tc.check +; CHECK: br i1 true, label %scalar.ph, label %vector.scevcheck ; CHECK-LABEL: for.cond: ; AUTOVF-LABEL: @main -; AUTOVF-NOT: vector.scevcheck -; AUTOVF-NOT: vector.body: +; AUTOVF: tc.check +; AUTOVF: br i1 true, label %scalar.ph, label %vector.scevcheck ; AUTOVF-LABEL: for.cond: define i32 @main() local_unnamed_addr { while.cond: diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll --- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -18,43 +18,41 @@ ; define void @vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) { ; CHECK-LABEL: @vectorized( -; CHECK-NEXT: tc.check: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: +; CHECK-NEXT: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0 -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[INDUCTION]], +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !0 +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* 
[[TMP6]] to <8 x float>* +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP2]]), !llvm.access.group !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !1 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !1 ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 16 -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP11]], [[TMP12]] ; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 diff --git a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll --- a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll +++ b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll @@ -1,14 +1,16 @@ -; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -; PR39417 -; Check that the need for overflow check prevents vectorizing a loop with tiny -; trip count (which implies opt for size). -; CHECK-LABEL: @func_34 -; CHECK-NOT: vector.scevcheck -; CHECK-NOT: vector.body: -; CHECK-LABEL: bb67: +; Check that the need for overflow check makes vectorization of a loop with tiny +; trip count not profitable. +; CHECK: LV: ScalarTotalCost = 21 +; CHECK-NEXT: LV: VecTotalCost = RTCost + (VecCost * VecTC) + (EpilogCost * EpilogTC) = 17 + (5 * 0) + (7 * 3) = 38 +; CHECK-NEXT: LV: It's not profitable to vectorize short trip count loop. 
+ +; CHECK-LABEL: @func_34( +; CHECK-NEXT: tc.check: +; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] define void @func_34() { bb1: br label %bb67 diff --git a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll --- a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll +++ b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll @@ -38,11 +38,35 @@ ; instead. define i64 @test1(i64 %y) { ; CHECK-LABEL: @test1( -; CHECK-NEXT: entry: +; CHECK-NEXT: tc.check: +; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> undef, i64 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT2]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i64> , [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP1]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> , <4 x i64> [[TMP2]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 3, 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0 +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]] ; CHECK: cond.false: ; CHECK-NEXT: [[DIV:%.*]] = xor i64 3, [[Y]] @@ -51,9 +75,9 @@ ; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[DIV]], [[COND_FALSE]] ], [ 77, [[FOR_BODY]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !2 ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -83,11 +107,34 @@ ; instead. 
define i64 @test2(i64 %y) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: entry: +; CHECK-NEXT: tc.check: +; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> undef, i64 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT2]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> , <4 x i64> +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 3, 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0 +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]] ; CHECK: cond.false: ; CHECK-NEXT: br label [[COND_END]] @@ -95,9 +142,9 @@ ; CHECK-NEXT: [[COND:%.*]] = phi i64 [ 55, [[COND_FALSE]] ], [ 77, [[FOR_BODY]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !5 ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -126,11 +173,32 @@ ; instead. 
define i32 @test3(i64 %y) { ; CHECK-LABEL: @test3( -; CHECK-NEXT: entry: +; CHECK-NEXT: tc.check: +; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND]], <4 x i32> +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 3, 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0 +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]] ; CHECK: cond.false: ; CHECK-NEXT: br label [[COND_END]] @@ -138,9 +206,9 @@ ; CHECK-NEXT: [[COND:%.*]] = phi i32 [ 55, [[COND_FALSE]] ], [ [[I]], [[FOR_BODY]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !7 ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[COND_END]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[COND_END]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[COND_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/short_tc_rt_checks.ll b/llvm/test/Transforms/LoopVectorize/short_tc_rt_checks.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/short_tc_rt_checks.ll @@ -0,0 +1,295 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes="print,loop-vectorize" -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s + +; Check vectorization of hot short trip count with epilog. In this case inner +; loop trip count is not constant and its value is estimated by profile. 
+ +; ModuleID = 'test.cpp' +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = dso_local global [5 x i32] zeroinitializer, align 16 +@b = dso_local global [5 x i32] zeroinitializer, align 16 + +; CHECK: LV: Found trip count: 0 +; CHECK: LV: Checking cost of run-time overhead for short trip count loop. +; CHECK: LV: It's still profitable to vectorize short trip count loop. +; +; CHECK: LV: Found trip count: 5 +; CHECK: LV: Checking cost of run-time overhead for short trip count loop. +; CHECK: LV: It's still profitable to vectorize short trip count loop. +; +; Function Attrs: uwtable +define dso_local void @_Z3fooi(i32 %M) local_unnamed_addr #0 !prof !11 { +; CHECK-LABEL: @_Z3fooi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca [5 x i32], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast [5 x i32]* [[A]] to i8* +; CHECK-NEXT: [[B:%.*]] = alloca [5 x i32], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [5 x i32]* [[B]] to i8* +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [5 x i32]* [[A]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull [[TMP0]]) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [5 x i32]* [[B]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull [[TMP1]]) +; CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[A]], i64 0, i64 0 +; CHECK-NEXT: br label [[FOR_BODY_US_PREHEADER:%.*]] +; CHECK: for.body.us.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[M:%.*]] to i64 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr [5 x i32], [5 x i32]* [[A]], i64 0, i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [5 x i32], [5 x i32]* [[B]], i64 0, i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP23:%.*]] = bitcast i32* [[SCEVGEP2]] to i8* +; CHECK-NEXT: br label [[FOR_BODY_US:%.*]] +; CHECK: for.body.us: +; CHECK-NEXT: [[J_019_US:%.*]] = phi i32 [ [[INC8_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: call void @_Z3barPi(i32* nonnull [[ARRAYDECAY]]) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[TMP0]], [[SCEVGEP23]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TMP1]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND4:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 2 +; 
CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[B]], i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !tbaa !3, !alias.scope !7 +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND4]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[A]], i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4, !tbaa !3, !alias.scope !10, !noalias !7 +; CHECK-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD6]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa !3, !alias.scope !10, !noalias !7 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[VEC_IND_NEXT5]] = add <4 x i32> [[VEC_IND4]], +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]] +; CHECK: for.body4.us: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY4_US]] ] +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[B]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4, !tbaa !3 +; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[MUL_US:%.*]] = mul nsw i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[A]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX6_US]], align 4, !tbaa !3 +; CHECK-NEXT: [[ADD_US:%.*]] = add nsw i32 [[TMP20]], [[MUL_US]] +; CHECK-NEXT: store i32 [[ADD_US]], i32* [[ARRAYIDX6_US]], align 4, !tbaa !3 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[FOR_BODY4_US]], !prof !14, !llvm.loop !15 +; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: +; CHECK-NEXT: [[INC8_US]] = add nuw nsw i32 [[J_019_US]], 1 +; CHECK-NEXT: [[EXITCOND21:%.*]] = icmp eq i32 [[INC8_US]], 20 +; CHECK-NEXT: br i1 [[EXITCOND21]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY_US]], !prof !16 +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.cond.cleanup.loopexit24: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull [[TMP1]]) +; CHECK-NEXT: call void 
@llvm.lifetime.end.p0i8(i64 20, i8* nonnull [[TMP0]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca [5 x i32], align 16 + %b = alloca [5 x i32], align 16 + %0 = bitcast [5 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3 + %1 = bitcast [5 x i32]* %b to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3 + %arraydecay = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 0 + br label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + %wide.trip.count = zext i32 %M to i64 + br label %for.body.us + +for.body.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.body.us.preheader + %j.019.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.body.us.preheader ] + call void @_Z3barPi(i32* nonnull %arraydecay) + br label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.body.us + %indvars.iv = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next, %for.body4.us ] + %arrayidx.us = getelementptr inbounds [5 x i32], [5 x i32]* %b, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx.us, align 4, !tbaa !2 + %3 = trunc i64 %indvars.iv to i32 + %mul.us = mul nsw i32 %2, %3 + %arrayidx6.us = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 %indvars.iv + %4 = load i32, i32* %arrayidx6.us, align 4, !tbaa !2 + %add.us = add nsw i32 %4, %mul.us + store i32 %add.us, i32* %arrayidx6.us, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us, !prof !10 + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us + %inc8.us = add nuw nsw i32 %j.019.us, 1 + %exitcond21 = icmp eq i32 %inc8.us, 20 + br i1 %exitcond21, label %for.cond.cleanup.loopexit, label %for.body.us, !prof !12 + +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit24: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit24, %for.cond.cleanup.loopexit + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %1) #3 + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %0) #3 + ret void +} + +; Check vectorization of hot short trip count with epilog. In this case inner +; loop trip count is known constant value. 
+ +; Function Attrs: uwtable +define dso_local void @_Z3fooi2() local_unnamed_addr #0 !prof !11 { +; CHECK-LABEL: @_Z3fooi2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[J_018:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC8:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] +; CHECK-NEXT: tail call void @_Z3barPi(i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 0, i64 0)) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 and (i1 icmp ult (i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i32 0, i32 0), i32* getelementptr inbounds ([5 x i32], [5 x i32]* @b, i64 1, i64 0)), i1 icmp ult (i32* getelementptr inbounds ([5 x i32], [5 x i32]* @b, i32 0, i32 0), i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 1, i64 0))), true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !tbaa !3, !alias.scope !17 +; CHECK-NEXT: [[TMP8:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !tbaa !3, !alias.scope !20, !noalias !17 +; CHECK-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, !tbaa !3, !alias.scope !20, !noalias !17 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !22 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 5, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY4:%.*]] +; CHECK: for.cond.cleanup3: +; CHECK-NEXT: [[INC8]] = add nuw nsw i32 [[J_018]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC8]], 1000 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !prof 
!23 +; CHECK: for.body4: +; CHECK-NEXT: [[I_017:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY4]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_017]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa !3 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], [[I_017]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !tbaa !3 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[MUL]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX6]], align 4, !tbaa !3 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_017]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[INC]], 5 +; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY4]], label [[FOR_COND_CLEANUP3]], !llvm.loop !24 +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3 + ret void + +for.body: ; preds = %entry, %for.cond.cleanup3 + %j.018 = phi i32 [ 0, %entry ], [ %inc8, %for.cond.cleanup3 ] + tail call void @_Z3barPi(i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 0, i64 0)) + br label %for.body4 + +for.cond.cleanup3: ; preds = %for.body4 + %inc8 = add nuw nsw i32 %j.018, 1 + %cmp = icmp ult i32 %inc8, 1000 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !prof !13 + +for.body4: ; preds = %for.body, %for.body4 + %i.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body4 ] + %idxprom = zext i32 %i.017 to i64 + %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul = mul nsw i32 %0, %i.017 + %arrayidx6 = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 %idxprom + %1 = load i32, i32* %arrayidx6, align 4, !tbaa !2 + %add = add nsw i32 %1, %mul + store i32 %add, i32* %arrayidx6, align 4, !tbaa !2 + %inc = add nuw nsw i32 %i.017, 1 + %cmp2 = icmp ult i32 %inc, 5 + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 + +declare dso_local void @_Z3barPi(i32*) local_unnamed_addr + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 + +attributes #0 = { "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project f379dd57b978c4e1483d721f422c79e3c0c5ccdc)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C++ TBAA"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.isvectorized", i32 1} +!8 = distinct !{!8, !9, !7} +!9 = !{!"llvm.loop.unroll.runtime.disable"} +!10 = !{!"branch_weights", i32 999, i32 4995} +!11 = !{!"function_entry_count", i64 1} +!12 = !{!"branch_weights", i32 1, i32 999} +!13 = !{!"branch_weights", i32 1000, i32 1} +