diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -168,9 +168,9 @@
 /// Information about vectorization costs
 struct VectorizationFactor {
   // Vector width with best cost
-  unsigned Width;
+  unsigned Width = 0;
   // Cost of the loop with that width
-  unsigned Cost;
+  unsigned Cost = 0;

   // Width 1 means no vectorization, cost 0 means uncomputed cost.
   static VectorizationFactor Disabled() { return {1, 0}; }
@@ -243,8 +243,9 @@
   void setBestPlan(unsigned VF, unsigned UF);

   /// Generate the IR code for the body of the vectorized loop according to the
-  /// best selected VPlan.
-  void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+  /// best selected VPlan. Returns 'true' if we successfully generated the
+  /// vector loop, 'false' otherwise.
+  bool executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);

   void printPlans(raw_ostream &O) {
     for (const auto &Plan : VPlans)
@@ -285,6 +286,11 @@
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
   void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF);
+
+  /// Returns 'false' if additional overhead from generated runtime checks
+  /// (trip count, memory dependency and SCEV checks) makes vectorization not
+  /// profitable, 'true' otherwise.
+  bool mayDisregardRTChecksOverhead(InnerLoopVectorizer &ILV);
 };

 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -294,6 +294,13 @@
     "vectorize-loops", cl::init(true), cl::Hidden,
     cl::desc("Run the Loop vectorization passes"));

+static cl::opt<bool> EnableTinyLoopVectorization(
+    "vectorize-tiny-loops-with-epilog", cl::init(false), cl::Hidden,
+    cl::desc("Enable vectorization of tiny loops even if run-time check "
+             "and/or scalar iteration overheads are incurred. See "
+             "'vectorizer-min-trip-count' for more information on tiny "
+             "loops."));
+
 /// A helper function for converting Scalar types to vector types.
 /// If the incoming type is void, we return void. If the VF is 1, we return
 /// the scalar type.
@@ -362,7 +369,7 @@
 /// 2) Returns expected trip count according to profile data if any.
 /// 3) Returns upper bound estimate if it is known.
 /// 4) Returns None if all of the above failed.
-static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
+Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
   // Check if exact trip count is known.
   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
     return ExpectedTC;
@@ -935,12 +942,6 @@
   // Vectorization with OptForSize: don't allow epilogues.
   CM_ScalarEpilogueNotAllowedOptSize,

-  // A special case of vectorisation with OptForSize: loops with a very small
-  // trip count are considered for vectorization under OptForSize, thereby
-  // making sure the cost of their loop body is dominant, free of runtime
-  // guards and scalar iteration overheads.
-  CM_ScalarEpilogueNotAllowedLowTripLoop,
-
   // Loop hint predicate indicating an epilogue is undesired.
   CM_ScalarEpilogueNotNeededUsePredicate
 };
@@ -973,7 +974,7 @@
   /// \return True if runtime checks are required for vectorization, and false
   /// otherwise.
-  bool runtimeChecksRequired();
+  bool runtimeChecksRequired(bool ReportFailure);

   /// \return The most profitable vectorization factor and the cost of that VF.
   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
@@ -1304,13 +1305,6 @@
   /// i.e. either vector version isn't available, or is too expensive.
   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

-private:
-  unsigned NumPredStores = 0;
-
-  /// \return An upper bound for the vectorization factor, larger than zero.
-  /// One is returned if vectorization should best be avoided due to cost.
-  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
-
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
   /// operate on
@@ -1320,16 +1314,23 @@
   /// actually taken place).
   using VectorizationCostTy = std::pair<unsigned, bool>;

+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
+
+private:
+  unsigned NumPredStores = 0;
+
+  /// \return An upper bound for the vectorization factor, larger than zero.
+  /// One is returned if vectorization should best be avoided due to cost.
+  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
+
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
   /// the factor width.
   VectorizationCostTy expectedCost(unsigned VF);

-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
@@ -1511,6 +1512,11 @@

   /// Values to ignore in the cost model when VF > 1.
   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+
+  /// Cached {VF, Cost} for the scalar loop (VF == 1).
+  VectorizationFactor ScalarVF;
+  /// Cached {VF, Cost} for the best expected vectorization mode.
+  VectorizationFactor BestVF;
 };

 } // end namespace llvm
@@ -4890,34 +4896,40 @@
   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
 }

-bool LoopVectorizationCostModel::runtimeChecksRequired() {
+bool LoopVectorizationCostModel::runtimeChecksRequired(bool ReportFailure) {
   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

   if (Legal->getRuntimePointerChecking()->Need) {
-    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
-        "runtime pointer checks needed. Enable vectorization of this "
-        "loop with '#pragma clang loop vectorize(enable)' when "
-        "compiling with -Os/-Oz",
-        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    if (ReportFailure)
+      reportVectorizationFailure(
+          "Runtime ptr check is required with -Os/-Oz",
+          "runtime pointer checks needed. Enable vectorization of this "
+          "loop with '#pragma clang loop vectorize(enable)' when "
+          "compiling with -Os/-Oz",
+          "CantVersionLoopWithOptForSize", ORE, TheLoop);
     return true;
   }

   if (!PSE.getUnionPredicate().getPredicates().empty()) {
-    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
-        "runtime SCEV checks needed. Enable vectorization of this "
-        "loop with '#pragma clang loop vectorize(enable)' when "
-        "compiling with -Os/-Oz",
-        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    if (ReportFailure)
+      reportVectorizationFailure(
+          "Runtime SCEV check is required with -Os/-Oz",
+          "runtime SCEV checks needed. Enable vectorization of this "
+          "loop with '#pragma clang loop vectorize(enable)' when "
+          "compiling with -Os/-Oz",
+          "CantVersionLoopWithOptForSize", ORE, TheLoop);
     return true;
   }

   // FIXME: Avoid specializing for stride==1 instead of bailing out.
   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
-    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
-        "runtime stride == 1 checks needed. Enable vectorization of "
-        "this loop with '#pragma clang loop vectorize(enable)' when "
-        "compiling with -Os/-Oz",
-        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    if (ReportFailure)
+      reportVectorizationFailure(
+          "Runtime stride check is required with -Os/-Oz",
+          "runtime stride == 1 checks needed. Enable vectorization of "
+          "this loop with '#pragma clang loop vectorize(enable)' when "
+          "compiling with -Os/-Oz",
+          "CantVersionLoopWithOptForSize", ORE, TheLoop);
     return true;
   }

@@ -4945,27 +4957,53 @@
   }

   switch (ScalarEpilogueStatus) {
-  case CM_ScalarEpilogueAllowed:
+  case CM_ScalarEpilogueAllowed: {
+    auto ExpectedTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
+    // Tiny loops are handled in a special way.
+    if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
+      if (EnableTinyLoopVectorization) {
+        // For tiny loops without runtime checks prefer masked vectorization
+        // to preserve legacy behavior.
+        // TODO: Ideally this decision should be done by cost model.
+        if (Hints->getForce() != LoopVectorizeHints::FK_Enabled &&
+            !runtimeChecksRequired(false)) {
+          LLVM_DEBUG(
+              dbgs() << "LV: Prefer masked vectorization for short trip "
+                     << "count loop.\n");
+          break;
+        }
+      } else {
+        LLVM_DEBUG(
+            dbgs() << "LV: Found a loop with a very small trip count. "
+                   << "This loop is worth vectorizing only if no scalar "
+                   << "iteration overheads are incurred.");
+        if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
+          LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
+        else {
+          LLVM_DEBUG(dbgs() << "\n");
+          ScalarEpilogueStatus = CM_ScalarEpilogueNotAllowedOptSize;
+          // Legacy behavior is to disable vectorization for tiny loops.
+          if (runtimeChecksRequired(true))
+            return None;
+          break;
+        }
+      }
+    }
     return computeFeasibleMaxVF(TC);
+  }
   case CM_ScalarEpilogueNotNeededUsePredicate:
     LLVM_DEBUG(
         dbgs() << "LV: vector predicate hint/switch found.\n"
                << "LV: Not allowing scalar epilogue, creating predicated "
                << "vector loop.\n");
     break;
-  case CM_ScalarEpilogueNotAllowedLowTripLoop:
-    // fallthrough as a special case of OptForSize
   case CM_ScalarEpilogueNotAllowedOptSize:
-    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
     LLVM_DEBUG(
         dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
-    else
-      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
-                        << "count.\n");
     // Bail if runtime checks are required, which are not good when optimising
     // for size.
-    if (runtimeChecksRequired())
+    if (runtimeChecksRequired(true))
       return None;
     break;
   }
@@ -4989,10 +5027,22 @@
   // by masking.
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
   if (Legal->prepareToFoldTailByMasking()) {
+    // Synchronize 'ScalarEpilogueStatus' with folding mode if required.
+    if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed)
+      ScalarEpilogueStatus = CM_ScalarEpilogueNotAllowedOptSize;
     FoldTailByMasking = true;
     return MaxVF;
   }

+  // If a scalar epilogue was not forbidden, proceed with 'normal'
+  // vectorization.
+  if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed) {
+    LLVM_DEBUG(
+        dbgs() << "LV: Masked vectorization is not allowed. Continue with "
+                  "'normal' vectorization using epilogue.\n");
+
+    return MaxVF;
+  }
+
   if (TC == 0) {
     reportVectorizationFailure(
         "Unable to calculate the loop count due to complex control flow",
@@ -5088,10 +5138,11 @@
 VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
-  float Cost = expectedCost(1).first;
-  const float ScalarCost = Cost;
+  ScalarVF = { 1, expectedCost(1).first };
+  float Cost = ScalarVF.Cost;
   unsigned Width = 1;
-  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
+  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarVF.Cost
+                    << ".\n");

   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
   if (ForceVectorization && MaxVF > 1) {
@@ -5126,15 +5177,16 @@
         "store that is conditionally executed prevents vectorization",
         "ConditionalStore", ORE, TheLoop);
     Width = 1;
-    Cost = ScalarCost;
+    Cost = ScalarVF.Cost;
   }

-  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
-             << "LV: Vectorization seems to be not beneficial, "
-             << "but was forced by a user.\n");
+  LLVM_DEBUG(
+      if (ForceVectorization && Width > 1 && Cost >= ScalarVF.Cost) dbgs()
+          << "LV: Vectorization seems to be not beneficial, "
+          << "but was forced by a user.\n");

   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
-  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
-  return Factor;
+  BestVF = { Width, (unsigned)(Width * Cost) };
+  return BestVF;
 }

 std::pair<unsigned, unsigned>
@@ -6561,7 +6613,120 @@
   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
 }

-void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
+// Helper function to calculate the cost of all instructions in \p C.
+template <typename Container>
+static uint64_t getCostOfBlocks(LoopVectorizationCostModel &CM,
+                                Container &&C) {
+  uint64_t TotalCost = 0;
+  for (BasicBlock *BB : C) {
+    for (Instruction &I : *BB) {
+      auto InstCost = CM.getInstructionCost(&I, 1).first;
+      TotalCost += InstCost;
+      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << InstCost
+                        << " for VF " << 1 << " For instruction: " << I
+                        << '\n');
+    }
+  }
+  return TotalCost;
+}
+
+// Returns 'false' if additional overhead from generated runtime checks (trip
+// count, memory dependency and SCEV checks) makes vectorization not
+// profitable, 'true' otherwise.
+bool LoopVectorizationPlanner::mayDisregardRTChecksOverhead(
+    InnerLoopVectorizer &ILV) {
+  Optional<unsigned> ExpectedTC =
+      getSmallBestKnownTC(*CM.PSE.getSE(), OrigLoop);
+
+  // No need to check for RT overhead for loops that are not expected to have
+  // a short trip count.
+  // TODO: This is done this way to preserve legacy behavior. Eventually we
+  // should check for RT overhead for all loops regardless of TC.
+  if (!ExpectedTC || *ExpectedTC >= TinyTripCountVectorThreshold) {
+    LLVM_DEBUG(
+        dbgs() << "LV: Disregarding run-time checks overhead: not a short "
+                  "trip count loop.\n");
+    return true;
+  }
+
+  // No need to check for RT overhead if vectorization was forced. Note that
+  // cost modeling may still be performed to select the best VF.
+  if (CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) {
+    LLVM_DEBUG(dbgs() << "LV: Disregarding run-time checks overhead: "
+                         "vectorization was forced.\n");
+    return true;
+  }
+
+  // No need to check for RT overhead if cost modeling was skipped and the VF
+  // was selected by the user.
+  if (CM.ScalarVF.Width == 0) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: Disregarding run-time checks overhead: VF was forced.\n");
+    return true;
+  }
+
+  assert(CM.BestVF.Width > 1 && "Best VF was not properly selected?");
+
+  LLVM_DEBUG(dbgs() << "LV: Checking cost of run-time overhead for short "
+                       "trip count loop.\n");
+
+  uint64_t VecTripCount = *ExpectedTC / CM.BestVF.Width;
+  uint64_t RemainderTripCount = *ExpectedTC % CM.BestVF.Width;
+
+  // In "foldTailByMasking" mode remainder iterations are executed as part of
+  // the main vector loop. That means all remainder iterations will be executed
+  // as one masked vector iteration.
+  if (RemainderTripCount != 0 && CM.foldTailByMasking()) {
+    ++VecTripCount;
+    RemainderTripCount = 0;
+  }
+
+  // In "requiresScalarEpilogue" mode at least one iteration must be executed
+  // in the remainder loop. If all iterations would end up in the main vector
+  // loop, move one vector iteration to the remainder loop.
+  if (RemainderTripCount == 0 && CM.requiresScalarEpilogue()) {
+    --VecTripCount;
+    RemainderTripCount = CM.BestVF.Width;
+  }
+
+  uint64_t VecRTCost = getCostOfBlocks(CM, ILV.LoopBypassBlocks) +
+                       getCostOfBlocks<SmallVector<BasicBlock *, 4> >(
+                           CM, { ILV.LoopVectorPreHeader, ILV.LoopMiddleBlock,
+                                 ILV.LoopScalarPreHeader });
+  uint64_t VecCost = CM.BestVF.Cost * VecTripCount;
+  uint64_t RemainderCost = CM.ScalarVF.Cost * RemainderTripCount;
+  uint64_t VecTotalCost = VecRTCost + VecCost + RemainderCost;
+  uint64_t ScalarTotalCost = CM.ScalarVF.Cost * (*ExpectedTC);
+
+  LLVM_DEBUG(dbgs() << "LV: ScalarTotalCost = " << ScalarTotalCost << "\n");
+  LLVM_DEBUG(dbgs() << "LV: VecTotalCost = RTCost + (VecCost * VecTC) + "
+                       "(RemainderCost * RemainderTC) = " << VecRTCost << " + ("
+                    << CM.BestVF.Cost << " * " << VecTripCount << ") + ("
+                    << CM.ScalarVF.Cost << " * " << RemainderTripCount
+                    << ") = " << VecTotalCost << "\n");
+
+  if (VecTotalCost >= ScalarTotalCost) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: It's not profitable to vectorize short trip count loop.\n");
+
+    assert(isa<BranchInst>(ILV.LoopBypassBlocks.front()->getTerminator()) &&
+           "RT check should end with branch instruction.");
+
+    return false;
+  }
+
+  LLVM_DEBUG(
+      dbgs()
+      << "LV: It's still profitable to vectorize short trip count loop.\n");
+  return true;
+}
+
+bool LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                            DominatorTree *DT) {
   // Perform the actual loop transformation.
@@ -6582,13 +6747,35 @@
   //
   //===------------------------------------------------===//

-  // 2. Copy and widen instructions from the old loop into the new loop.
-  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
-  VPlans.front()->execute(&State);
+  bool IsVectorizationProfitable = mayDisregardRTChecksOverhead(ILV);
+  // Skip generation of the vector body if vectorization turned out not to be
+  // profitable (the vector loop is dead in this case).
+  if (IsVectorizationProfitable) {
+    // 2. Copy and widen instructions from the old loop into the new loop.
+    assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
+    VPlans.front()->execute(&State);
+
+    // 3. Fix the vectorized code: take care of header phi's, live-outs,
+    // predication, updating analyses.
+    ILV.fixVectorizedLoop();
+  } else {
+    // Make the vectorized loop effectively dead. Later optimizations should
+    // clean it up.
+    auto *BrInst =
+        cast<BranchInst>(ILV.LoopBypassBlocks.front()->getTerminator());
+    BrInst->setCondition(
+        ConstantInt::getTrue(BrInst->getCondition()->getType()));
+
+    ILV.ORE->emit([&]() {
+      return OptimizationRemark(LV_NAME, "Not Vectorized",
+                                ILV.OrigLoop->getStartLoc(),
+                                ILV.OrigLoop->getHeader())
+             << "not profitable to vectorize short trip count loop.";
+    });
+  }

-  // 3. Fix the vectorized code: take care of header phi's, live-outs,
-  // predication, updating analyses.
-  ILV.fixVectorizedLoop();
+  return IsVectorizationProfitable;
 }

 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
@@ -7584,7 +7771,9 @@
                                         &CM);
   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                     << L->getHeader()->getParent()->getName() << "\"\n");
-  LVP.executePlan(LB, DT);
+  bool IsVectorized = LVP.executePlan(LB, DT);
+  assert(IsVectorized && "VPlan failed to be executed in native path.");
+  (void)IsVectorized;

   // Mark the loop as already vectorized to avoid vectorizing again.
   Hints.setAlreadyVectorized();
@@ -7663,21 +7852,6 @@

   assert(L->empty() && "Inner loop expected.");

-  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
-  // count by optimizing for size, to minimize overheads.
-  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
-  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
-    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
-                      << "This loop is worth vectorizing only if no scalar "
-                      << "iteration overheads are incurred.");
-    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
-      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
-    else {
-      LLVM_DEBUG(dbgs() << "\n");
-      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
-    }
-  }
-
   // Check the function attributes to see if implicit floats are allowed.
   // FIXME: This check doesn't seem possibly correct -- what if the loop is
   // an integer loop and the vector instructions selected are purely integer
@@ -7833,59 +8007,61 @@
   LVP.setBestPlan(VF.Width, IC);

   using namespace ore;
-  bool DisableRuntimeUnroll = false;
   MDNode *OrigLoopID = L->getLoopID();
+  std::unique_ptr<InnerLoopVectorizer> ILV;
   if (!VectorizeLoop) {
     assert(IC > 1 && "interleave count should not be 1 or 0");
     // If we decided that it is not legal to vectorize the loop, then
     // interleave it.
-    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
-                               &CM);
-    LVP.executePlan(Unroller, DT);
-
-    ORE->emit([&]() {
-      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
-                                L->getHeader())
-             << "interleaved loop (interleaved count: "
-             << NV("InterleaveCount", IC) << ")";
-    });
+    ILV = std::make_unique<InnerLoopUnroller>(L, PSE, LI, DT, TLI, TTI, AC,
+                                              ORE, IC, &LVL, &CM);
   } else {
     // If we decided that it is *legal* to vectorize the loop, then do it.
-    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
-                           &LVL, &CM);
-    LVP.executePlan(LB, DT);
-    ++LoopsVectorized;
-
-    // Add metadata to disable runtime unrolling a scalar loop when there are
-    // no runtime checks about strides and memory. A scalar loop that is
-    // rarely used is not worth unrolling.
-    if (!LB.areSafetyChecksAdded())
-      DisableRuntimeUnroll = true;
-
-    // Report the vectorization decision.
-    ORE->emit([&]() {
-      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
-                                L->getHeader())
-             << "vectorized loop (vectorization width: "
-             << NV("VectorizationFactor", VF.Width)
-             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
-    });
+    ILV = std::make_unique<InnerLoopVectorizer>(L, PSE, LI, DT, TLI, TTI, AC,
+                                                ORE, VF.Width, IC, &LVL, &CM);
   }

-  Optional<MDNode *> RemainderLoopID =
-      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
-                                      LLVMLoopVectorizeFollowupEpilogue});
-  if (RemainderLoopID.hasValue()) {
-    L->setLoopID(RemainderLoopID.getValue());
-  } else {
-    if (DisableRuntimeUnroll)
-      AddRuntimeUnrollDisableMetaData(L);
+  bool IsPlanExecuted = LVP.executePlan(*ILV, DT);

-    // Mark the loop as already vectorized to avoid vectorizing again.
-    Hints.setAlreadyVectorized();
+  if (IsPlanExecuted) {
+    if (!VectorizeLoop) {
+      ORE->emit([&]() {
+        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
+                                  L->getHeader())
+               << "interleaved loop (interleaved count: "
+               << NV("InterleaveCount", IC) << ")";
+      });
+    } else {
+      ++LoopsVectorized;
+      // Report the vectorization decision.
+      ORE->emit([&]() {
+        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
+                                  L->getHeader())
+               << "vectorized loop (vectorization width: "
+               << NV("VectorizationFactor", VF.Width)
+               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
+      });
+    }
+
+    Optional<MDNode *> RemainderLoopID =
+        makeFollowupLoopID(OrigLoopID, { LLVMLoopVectorizeFollowupAll,
+                                         LLVMLoopVectorizeFollowupEpilogue });
+    if (RemainderLoopID.hasValue()) {
+      L->setLoopID(RemainderLoopID.getValue());
+    } else {
+      // Add metadata to disable runtime unrolling a scalar loop when there are
+      // no runtime checks about strides and memory. A scalar loop that is
+      // rarely used is not worth unrolling.
+      if (VectorizeLoop && !ILV->areSafetyChecksAdded())
+        AddRuntimeUnrollDisableMetaData(L);
+
+      // Mark the loop as already vectorized to avoid vectorizing again.
+      Hints.setAlreadyVectorized();
+    }
   }

+  // The IR may have been changed even if 'IsPlanExecuted' is false.
   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
   return true;
 }
diff --git a/llvm/test/Transforms/LoopVectorize/short_tc_rt_checks.ll b/llvm/test/Transforms/LoopVectorize/short_tc_rt_checks.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/short_tc_rt_checks.ll
@@ -0,0 +1,296 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes="print,loop-vectorize" -S -debug-only=loop-vectorize -vectorize-tiny-loops-with-epilog=true < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Check vectorization of a hot short trip count loop with an epilogue. In this
+; case the inner loop trip count is not a compile-time constant and its value
+; is estimated from profile data.
+
+; ModuleID = 'test.cpp'
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = dso_local global [5 x i32] zeroinitializer, align 16
+@b = dso_local global [5 x i32] zeroinitializer, align 16
+
+; CHECK: LV: Found trip count: 0
+; CHECK: LV: Checking cost of run-time overhead for short trip count loop.
+; CHECK: LV: It's still profitable to vectorize short trip count loop.
+;
+; CHECK: LV: Found trip count: 5
+; CHECK: LV: Checking cost of run-time overhead for short trip count loop.
+; CHECK: LV: It's still profitable to vectorize short trip count loop.
+; +; Function Attrs: uwtable +define dso_local void @_Z3fooi(i32 %M) local_unnamed_addr #0 !prof !11 { +; CHECK-LABEL: @_Z3fooi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca [5 x i32], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast [5 x i32]* [[A]] to i8* +; CHECK-NEXT: [[B:%.*]] = alloca [5 x i32], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [5 x i32]* [[B]] to i8* +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [5 x i32]* [[A]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull [[TMP0]]) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [5 x i32]* [[B]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull [[TMP1]]) +; CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[A]], i64 0, i64 0 +; CHECK-NEXT: br label [[FOR_BODY_US_PREHEADER:%.*]] +; CHECK: for.body.us.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[M:%.*]] to i64 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr [5 x i32], [5 x i32]* [[A]], i64 0, i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [5 x i32], [5 x i32]* [[B]], i64 0, i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP23:%.*]] = bitcast i32* [[SCEVGEP2]] to i8* +; CHECK-NEXT: br label [[FOR_BODY_US:%.*]] +; CHECK: for.body.us: +; CHECK-NEXT: [[J_019_US:%.*]] = phi i32 [ [[INC8_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: call void @_Z3barPi(i32* nonnull [[ARRAYDECAY]]) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[TMP0]], [[SCEVGEP23]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TMP1]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND4:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[B]], i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !tbaa !3, !alias.scope !7 +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND4]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[A]], i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i32 0 
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4, !tbaa !3, !alias.scope !10, !noalias !7 +; CHECK-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD6]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa !3, !alias.scope !10, !noalias !7 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[VEC_IND_NEXT5]] = add <4 x i32> [[VEC_IND4]], +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]] +; CHECK: for.body4.us: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY4_US]] ] +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[B]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4, !tbaa !3 +; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[MUL_US:%.*]] = mul nsw i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[A]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX6_US]], align 4, !tbaa !3 +; CHECK-NEXT: [[ADD_US:%.*]] = add nsw i32 [[TMP20]], [[MUL_US]] +; CHECK-NEXT: store i32 [[ADD_US]], i32* [[ARRAYIDX6_US]], align 4, !tbaa !3 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[FOR_BODY4_US]], !prof !14, !llvm.loop !15 +; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: +; CHECK-NEXT: [[INC8_US]] = add nuw nsw i32 [[J_019_US]], 1 +; CHECK-NEXT: [[EXITCOND21:%.*]] = icmp eq i32 [[INC8_US]], 20 +; CHECK-NEXT: br i1 [[EXITCOND21]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY_US]], !prof !16 +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.cond.cleanup.loopexit24: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull [[TMP1]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull [[TMP0]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca [5 x i32], align 16 + %b = alloca [5 x i32], align 16 + %0 = bitcast [5 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3 + %1 = bitcast [5 x i32]* %b to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3 + %arraydecay = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 0 + br label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + %wide.trip.count = zext i32 %M to i64 + br label %for.body.us + +for.body.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, 
%for.body.us.preheader + %j.019.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.body.us.preheader ] + call void @_Z3barPi(i32* nonnull %arraydecay) + br label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.body.us + %indvars.iv = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next, %for.body4.us ] + %arrayidx.us = getelementptr inbounds [5 x i32], [5 x i32]* %b, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx.us, align 4, !tbaa !2 + %3 = trunc i64 %indvars.iv to i32 + %mul.us = mul nsw i32 %2, %3 + %arrayidx6.us = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 %indvars.iv + %4 = load i32, i32* %arrayidx6.us, align 4, !tbaa !2 + %add.us = add nsw i32 %4, %mul.us + store i32 %add.us, i32* %arrayidx6.us, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us, !prof !10 + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us + %inc8.us = add nuw nsw i32 %j.019.us, 1 + %exitcond21 = icmp eq i32 %inc8.us, 20 + br i1 %exitcond21, label %for.cond.cleanup.loopexit, label %for.body.us, !prof !12 + +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit24: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit24, %for.cond.cleanup.loopexit + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %1) #3 + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %0) #3 + ret void +} + +; Check vectorization of hot short trip count with epilog. In this case inner +; loop trip count is known constant value. 
+ +; Function Attrs: uwtable +define dso_local void @_Z3fooi2() local_unnamed_addr #0 !prof !11 { +; CHECK-LABEL: @_Z3fooi2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[J_018:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC8:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] +; CHECK-NEXT: tail call void @_Z3barPi(i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 0, i64 0)) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 and (i1 icmp ult (i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i32 0, i32 0), i32* getelementptr inbounds ([5 x i32], [5 x i32]* @b, i64 1, i64 0)), i1 icmp ult (i32* getelementptr inbounds ([5 x i32], [5 x i32]* @b, i32 0, i32 0), i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 1, i64 0))), true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !tbaa !3, !alias.scope !17 +; CHECK-NEXT: [[TMP8:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !tbaa !3, !alias.scope !20, !noalias !17 +; CHECK-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, !tbaa !3, !alias.scope !20, !noalias !17 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !22 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 5, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY4:%.*]] +; CHECK: for.cond.cleanup3: +; CHECK-NEXT: [[INC8]] = add nuw nsw i32 [[J_018]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC8]], 1000 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !prof 
!23 +; CHECK: for.body4: +; CHECK-NEXT: [[I_017:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY4]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_017]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa !3 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], [[I_017]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !tbaa !3 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[MUL]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX6]], align 4, !tbaa !3 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_017]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[INC]], 5 +; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY4]], label [[FOR_COND_CLEANUP3]], !llvm.loop !24 +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3 + ret void + +for.body: ; preds = %entry, %for.cond.cleanup3 + %j.018 = phi i32 [ 0, %entry ], [ %inc8, %for.cond.cleanup3 ] + tail call void @_Z3barPi(i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 0, i64 0)) + br label %for.body4 + +for.cond.cleanup3: ; preds = %for.body4 + %inc8 = add nuw nsw i32 %j.018, 1 + %cmp = icmp ult i32 %inc8, 1000 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !prof !13 + +for.body4: ; preds = %for.body, %for.body4 + %i.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body4 ] + %idxprom = zext i32 %i.017 to i64 + %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul = mul nsw i32 %0, %i.017 + %arrayidx6 = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 %idxprom + %1 = load i32, i32* %arrayidx6, align 4, !tbaa !2 + %add = add nsw i32 %1, %mul + store i32 %add, i32* %arrayidx6, align 4, !tbaa !2 + %inc = add nuw nsw i32 %i.017, 1 + %cmp2 = icmp ult i32 %inc, 5 + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 + +declare dso_local void @_Z3barPi(i32*) local_unnamed_addr + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 + +attributes #0 = { "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project f379dd57b978c4e1483d721f422c79e3c0c5ccdc)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C++ TBAA"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.isvectorized", i32 1} +!8 = distinct !{!8, !9, !7} +!9 = !{!"llvm.loop.unroll.runtime.disable"} +!10 = !{!"branch_weights", i32 999, i32 4995} +!11 = !{!"function_entry_count", i64 1} +!12 = !{!"branch_weights", i32 1, i32 999} +!13 = !{!"branch_weights", i32 1000, i32 1} +
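
The profitability check in mayDisregardRTChecksOverhead boils down to comparing the cost of the scalar loop against the cost of the vector loop plus the run-time check overhead and the scalar remainder. The snippet below is a minimal standalone sketch of that comparison with LLVM's cost-model types replaced by plain integers; the helper name profitableWithRTChecks and the numbers in main() are hypothetical and for illustration only.

#include <cstdint>
#include <cstdio>

struct VFCost {
  unsigned Width; // vectorization factor (1 == scalar)
  uint64_t Cost;  // cost of one loop iteration at this VF
};

// True if vectorizing still pays off once the run-time check blocks
// (RTCheckCost) and the scalar remainder are accounted for, for a loop
// expected to run ExpectedTC iterations.
static bool profitableWithRTChecks(uint64_t ExpectedTC, VFCost Scalar,
                                   VFCost Best, uint64_t RTCheckCost,
                                   bool FoldTailByMasking,
                                   bool RequiresScalarEpilogue) {
  uint64_t VecTC = ExpectedTC / Best.Width;
  uint64_t RemTC = ExpectedTC % Best.Width;

  // Tail folding runs the remainder as one extra masked vector iteration.
  if (RemTC != 0 && FoldTailByMasking) {
    ++VecTC;
    RemTC = 0;
  }
  // A required scalar epilogue must execute at least one scalar iteration,
  // so give one vector iteration back to the remainder loop.
  if (RemTC == 0 && RequiresScalarEpilogue && VecTC != 0) {
    --VecTC;
    RemTC = Best.Width;
  }

  uint64_t VecTotal = RTCheckCost + Best.Cost * VecTC + Scalar.Cost * RemTC;
  uint64_t ScalarTotal = Scalar.Cost * ExpectedTC;
  return VecTotal < ScalarTotal;
}

int main() {
  // Hypothetical numbers: TC = 5, scalar body cost 8, VF=4 body cost 10,
  // run-time checks cost 6, scalar epilogue required, no tail folding.
  bool Profitable = profitableWithRTChecks(/*ExpectedTC=*/5, /*Scalar=*/{1, 8},
                                           /*Best=*/{4, 10},
                                           /*RTCheckCost=*/6,
                                           /*FoldTailByMasking=*/false,
                                           /*RequiresScalarEpilogue=*/true);
  std::printf("vectorization profitable: %s\n", Profitable ? "yes" : "no");
  return 0;
}

With these made-up costs the vector path totals 6 + 10*1 + 8*1 = 24 against a scalar total of 8*5 = 40, so vectorization is still reported as profitable even though the trip count is tiny.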