diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -219,16 +219,9 @@ ExactFPMathInst = I; } - void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } - Instruction *getExactFPInst() { return ExactFPMathInst; } - unsigned getNumRuntimePointerChecks() const { - return NumRuntimePointerChecks; - } - private: - unsigned NumRuntimePointerChecks = 0; Instruction *ExactFPMathInst = nullptr; }; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -993,7 +993,6 @@ } } - Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); PSE.addPredicate(LAI->getPSE().getPredicate()); return true; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -33,7 +33,6 @@ class LoopVectorizationLegality; class LoopVectorizationCostModel; class PredicatedScalarEvolution; -class LoopVectorizationRequirements; class LoopVectorizeHints; class OptimizationRemarkEmitter; class TargetTransformInfo; @@ -191,6 +190,10 @@ /// Cost of the scalar loop. InstructionCost ScalarCost; + /// The minimum trip count required to make vectorization profitable, e.g. due + /// to runtime checks. 
+ ElementCount MinProfitableTripCount; + VectorizationFactor(ElementCount Width, InstructionCost Cost, InstructionCost ScalarCost) : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {} @@ -268,8 +271,6 @@ const LoopVectorizeHints &Hints; - LoopVectorizationRequirements &Requirements; - OptimizationRemarkEmitter *ORE; SmallVector VPlans; @@ -285,10 +286,9 @@ InterleavedAccessInfo &IAI, PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints, - LoopVectorizationRequirements &Requirements, OptimizationRemarkEmitter *ORE) : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI), - PSE(PSE), Hints(Hints), Requirements(Requirements), ORE(ORE) {} + PSE(PSE), Hints(Hints), ORE(ORE) {} /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -196,10 +196,9 @@ "value are vectorized only if no scalar iteration overheads " "are incurred.")); -static cl::opt PragmaVectorizeMemoryCheckThreshold( - "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, - cl::desc("The maximum allowed number of runtime memory checks with a " - "vectorize(enable) pragma.")); +static cl::opt VectorizeMemoryCheckThreshold( + "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum allowed number of runtime memory checks")); // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, // that predication is preferred, and this lists all options. 
I.e., the @@ -442,6 +441,7 @@ const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, + ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) @@ -453,6 +453,11 @@ // of the original loop header may change as the transformation happens. OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); + + if (MinProfitableTripCount.isZero()) + this->MinProfitableTripCount = VecWidth; + else + this->MinProfitableTripCount = MinProfitableTripCount; } virtual ~InnerLoopVectorizer() = default; @@ -691,6 +696,8 @@ /// vector elements. ElementCount VF; + ElementCount MinProfitableTripCount; + /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -770,6 +777,7 @@ LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + ElementCount::getFixed(1), ElementCount::getFixed(1), UnrollFactor, LVL, CM, BFI, PSI, Check) {} @@ -818,8 +826,8 @@ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, - Checks), + EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL, + CM, BFI, PSI, Checks), EPI(EPI) {} // Override this function to handle the more complex control flow around the @@ -1931,14 +1939,17 @@ DominatorTree *DT; LoopInfo *LI; + TargetTransformInfo *TTI; SCEVExpander SCEVExp; SCEVExpander MemCheckExp; + bool CostTooHigh = false; + public: GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, - const DataLayout &DL) - : DT(DT), LI(LI), SCEVExp(SE, DL, 
"scev.check"), + TargetTransformInfo *TTI, const DataLayout &DL) + : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), MemCheckExp(SE, DL, "scev.check") {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can @@ -1949,6 +1960,15 @@ void Create(Loop *L, const LoopAccessInfo &LAI, const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { + // Hard cutoff to limit compile-time increase in case a very large number of + // runtime checks needs to be generated. + // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to + // profile info. + CostTooHigh = + LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; + if (CostTooHigh) + return; + BasicBlock *LoopHeader = L->getHeader(); BasicBlock *Preheader = L->getLoopPreheader(); @@ -2020,6 +2040,44 @@ } } + InstructionCost getCost() { + if (SCEVCheckBlock || MemCheckBlock) + LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); + + if (CostTooHigh) { + InstructionCost Cost; + Cost.setInvalid(); + LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); + return Cost; + } + + InstructionCost RTCheckCost = 0; + if (SCEVCheckBlock) + for (Instruction &I : *SCEVCheckBlock) { + if (SCEVCheckBlock->getTerminator() == &I) + continue; + InstructionCost C = + TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); + RTCheckCost += C; + } + if (MemCheckBlock) + for (Instruction &I : *MemCheckBlock) { + if (MemCheckBlock->getTerminator() == &I) + continue; + InstructionCost C = + TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); + RTCheckCost += C; + } + + if (SCEVCheckBlock || MemCheckBlock) + LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost + << "\n"); + + return RTCheckCost; + } + /// Remove the created SCEV & memory runtime check blocks & instructions, if /// unused. 
~GeneratedRTChecks() { @@ -2962,9 +3020,16 @@ // If tail is to be folded, vector loop takes care of all iterations. Type *CountTy = Count->getType(); Value *CheckMinIters = Builder.getFalse(); - Value *Step = createStepForVF(Builder, CountTy, VF, UF); + auto CreateStep = [&]() { + // Create step with max(MinProfitableTripCount, UF * VF). + if (UF * VF.getKnownMinValue() < MinProfitableTripCount.getKnownMinValue()) + return createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); + return createStepForVF(Builder, CountTy, VF, UF); + }; + if (!Cost->foldTailByMasking()) - CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); + CheckMinIters = + Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); else if (VF.isScalable()) { // vscale is not necessarily a power-of-2, which means we cannot guarantee // an overflow to zero when updating induction variables and so an @@ -2976,8 +3041,9 @@ Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); // Don't execute the vector loop if (UMax - n) < (VF * UF). - CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); + CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); } + // Create new preheader for vector loop.
LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, @@ -3002,7 +3068,6 @@ } BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { - BasicBlock *const SCEVCheckBlock = RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); if (!SCEVCheckBlock) @@ -7460,14 +7525,6 @@ return VectorizationFactor::Disabled(); } -bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const { - unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); - return (NumRuntimePointerChecks > - VectorizerParams::RuntimeMemoryCheckThreshold && - !Hints.allowReordering()) || - NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; -} - Optional LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -10181,8 +10238,7 @@ // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, - Requirements, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE); // Get user vectorization factor. 
ElementCount UserVF = Hints.getWidth(); @@ -10201,10 +10257,10 @@ VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); { - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getParent()->getDataLayout()); - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM, BFI, PSI, Checks); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, + VF.Width, 1, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); @@ -10261,6 +10317,94 @@ } } +static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, + VectorizationFactor &VF, Loop *L, + ScalarEvolution &SE) { + InstructionCost CheckCost = Checks.getCost(); + if (!CheckCost.isValid()) + return false; + + // When interleaving only scalar and vector cost will be equal, which in turn + // would lead to a divide by 0. Fall back to hard threshold. + if (VF.Width.isScalar()) { + if (CheckCost > VectorizeMemoryCheckThreshold) { + LLVM_DEBUG( + dbgs() + << "LV: Interleaving only is not profitable due to runtime checks\n"); + return false; + } + return true; + } + + // First, compute the minimum iteration count required so that the vector + // loop outperforms the scalar loop. + // The total cost of the scalar loop is + // ScalarC * TC + // where + // * TC is the actual trip count of the loop. + // * ScalarC is the cost of a single scalar iteration. + // + // The total cost of the vector loop is + // RtC + VecC * (TC / VF) + EpiC + // where + // * RtC is the cost of the generated runtime checks + // * VecC is the cost of a single vector iteration. + // * TC is the actual trip count of the loop + // * VF is the vectorization factor + // * EpiC is the cost of the generated epilogue, including the cost + // of the remaining scalar operations.
+ // + // Vectorization is profitable once the total vector cost is less than the + // total scalar cost: + // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC + // + // Now we can compute the minimum required trip count TC as + // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC + // + // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that + // the computations are performed on doubles, not integers and the result + // is rounded up, hence we get an upper estimate of the TC. + unsigned IntVF = VF.Width.getKnownMinValue(); + double ScalarC = *VF.ScalarCost.getValue(); + double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; + double RtC = *CheckCost.getValue(); + double MinTC1 = RtC / (ScalarC - VecCOverVF); + + // Second, compute a minimum iteration count so that the cost of the + // runtime checks is only a fraction of the total scalar loop cost. This + // adds a loop-dependent bound on the overhead incurred if the runtime + // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC + // * TC. To bound the runtime check to be a fraction 1/X of the scalar + // cost, compute + // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC + double MinTC2 = RtC * 10 / ScalarC; + + // Now pick the larger minimum. If it is not a multiple of VF, choose the + // next closest multiple of VF. This should partly compensate for ignoring + // the epilogue cost. + uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); + VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF)); + + LLVM_DEBUG( + dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" + << VF.MinProfitableTripCount << "\n"); + + // Skip vectorization if the expected trip count is less than the minimum + // required trip count. 
+ if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { + if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), + VF.MinProfitableTripCount)) { + LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " + "trip count < minimum profitable VF (" + << *ExpectedTC << " < " << VF.MinProfitableTripCount + << ")\n"); + + return false; + } + } + return true; +} + LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || !EnableLoopInterleaving), @@ -10418,8 +10562,7 @@ CM.collectElementTypesForWidening(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, - Requirements, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); @@ -10431,21 +10574,9 @@ VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getParent()->getDataLayout()); if (MaybeVF) { - if (LVP.requiresTooManyRuntimeChecks()) { - ORE->emit([&]() { - return OptimizationRemarkAnalysisAliasing( - DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), - L->getHeader()) - << "loop not vectorized: cannot prove it is safe to reorder " - "memory operations"; - }); - LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); - Hints.emitRemarkWithHints(); - return false; - } VF = *MaybeVF; // Select the interleave count. IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); @@ -10455,6 +10586,13 @@ // they turn out to not be profitable. if (VF.Width.isVector() || SelectedIC > 1) Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); + + // Check if it is profitable to vectorize with runtime checks. 
+ bool ForceVectorization = + Hints.getForce() == LoopVectorizeHints::FK_Enabled; + if (!ForceVectorization && + !areRuntimeChecksProfitable(Checks, VF, L, *PSE.getSE())) + return false; } // Identify the diagnostic messages that should be produced. @@ -10611,8 +10749,9 @@ if (!MainILV.areSafetyChecksAdded()) DisableRuntimeUnroll = true; } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM, BFI, PSI, Checks); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, + VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, + PSI, Checks); VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll @@ -1,4 +1,5 @@ -; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -S %s | FileCheck %s +; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -S %s | FileCheck --check-prefixes=CHECK,DEFAULT %s +; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefixes=CHECK,THRESHOLD %s ; Tests for loops with large numbers of runtime checks. Check that loops are ; vectorized, if the loop trip counts are large and the impact of the runtime @@ -57,11 +58,13 @@ ret void } -; FIXME ; The trip count in the loop in this function high enough to warrant large runtime checks. 
; CHECK-LABEL: define {{.*}} @test_tc_big_enough -; CHECK-NOT: vector.memcheck -; CHECK-NOT: vector.body +; DEFAULT: vector.memcheck +; DEFAULT: vector.body +; THRESHOLD-NOT: vector.memcheck +; THRESHOLD-NOT: vector.body +; define void @test_tc_big_enough(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) { entry: br label %loop @@ -112,8 +115,11 @@ define void @test_tc_unknown(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2, i64 %N) { ; CHECK-LABEL: define void @test_tc_unknown -; CHECK-NOT: vector.memcheck -; CHECK-NOT: vector.body +; DEFAULT: [[ADD:%.+]] = add i64 %N, 1 +; DEFAULT-NEXT: [[C:%.+]] = icmp ult i64 [[ADD]], 16 +; DEFAULT-NEXT: br i1 [[C]], label %scalar.ph, label %vector.memcheck +; THRESHOLD-NOT: vector.memcheck +; THRESHOLD-NOT: vector.body ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -10,9 +10,9 @@ ; CHECK-LABEL: @simple_memset( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -8,9 +8,9 @@ ; 
CHECK-LABEL: @simple_memset( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -102,9 +102,9 @@ ; CHECK-LABEL: @cond_memset( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -10,9 +10,9 @@ ; CHECK-LABEL: @simple_memset( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -63,9 +63,9 @@ ; CHECK-LABEL: @simple_memcpy( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; 
CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -123,9 +123,9 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 -1, [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = sub i64 -1, [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]] ; CHECK-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -193,9 +193,9 @@ ; CHECK-LABEL: @simple_gather_scatter( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -253,9 +253,9 @@ define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 { ; CHECK-LABEL: @uniform_load( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]] ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -312,9 +312,9 @@ define void @cond_uniform_load(i32* noalias %dst, i32* noalias 
readonly %src, i32* noalias readonly %cond, i64 %n) #0 { ; CHECK-LABEL: @cond_uniform_load( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]] ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -389,9 +389,9 @@ define void @uniform_store(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 { ; CHECK-LABEL: @uniform_store( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]] ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -445,9 +445,9 @@ ; CHECK-LABEL: @simple_fdiv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -507,9 +507,9 @@ ; CHECK-LABEL: @add_reduction_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], 
label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -563,9 +563,9 @@ ; CHECK-LABEL: @add_reduction_f32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -617,9 +617,9 @@ define i32 @cond_xor_reduction(i32* noalias %a, i32* noalias %cond, i64 %N) #0 { ; CHECK-LABEL: @cond_xor_reduction( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]] ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -863,7 +863,7 @@ ; AVX512-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4 ; AVX512-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2 ; AVX512-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 -; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16 +; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 32 ; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: ; AVX512-NEXT: [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll 
b/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll --- a/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -runtime-memory-check-threshold=9 -passes='loop-vectorize' -mtriple=x86_64-unknown-linux -S -debug %s 2>&1 | FileCheck %s +; RUN: opt -passes='loop-vectorize' -mtriple=x86_64-unknown-linux -S -debug %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -10,11 +10,63 @@ ; Test case where the memory runtime checks and vector body is more expensive ; than running the scalar loop. -; TODO: should not be vectorized. define void @test(double* nocapture %A, double* nocapture %B, double* nocapture %C, double* nocapture %D, double* nocapture %E) { + +; CHECK: Calculating cost of runtime checks: +; CHECK-NEXT: 0 for {{.+}} = getelementptr double, double* %A, i64 16 +; CHECK-NEXT: 0 for {{.+}} = bitcast double* +; CHECK-NEXT: 0 for {{.+}} = getelementptr double, double* %B, i64 16 +; CHECK-NEXT: 0 for {{.+}} = bitcast double* +; CHECK-NEXT: 0 for {{.+}} = getelementptr double, double* %E, i64 16 +; CHECK-NEXT: 0 for {{.+}} = bitcast double* +; CHECK-NEXT: 0 for {{.+}} = getelementptr double, double* %C, i64 16 +; CHECK-NEXT: 0 for {{.+}} = bitcast double* +; CHECK-NEXT: 0 for {{.+}} = getelementptr double, double* %D, i64 16 +; CHECK-NEXT: 0 for {{.+}} = bitcast double* +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = and i1 +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = and i1 +; CHECK-NEXT: 1 for {{.+}} = or i1 +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 
for {{.+}} = and i1 +; CHECK-NEXT: 1 for {{.+}} = or i1 +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = and i1 +; CHECK-NEXT: 1 for {{.+}} = or i1 +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = and i1 +; CHECK-NEXT: 1 for {{.+}} = or i1 +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = and i1 +; CHECK-NEXT: 1 for {{.+}} = or i1 +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = and i1 +; CHECK-NEXT: 1 for {{.+}} = or i1 +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = and i1 +; CHECK-NEXT: 1 for {{.+}} = or i1 +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = icmp ult i8* +; CHECK-NEXT: 1 for {{.+}} = and i1 +; CHECK-NEXT: 1 for {{.+}} = or i1 +; CHECK-NEXT: Total cost of runtime checks: 35 + +; CHECK: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (16 < 70) +; ; CHECK-LABEL: @test( -; CHECK: vector.memcheck -; CHECK: vector.body +; CHECK-NEXT: entry: +; CHECK-NEXT: br label %for.body +; CHECK-NOT: vector.memcheck +; CHECK-NOT: vector.body ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll --- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll @@ -15,7 +15,7 @@ ; CHECK-NEXT: [[DOT12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP1:%.*]], i64 16 ; CHECK-NEXT: [[DOT13:%.*]] = bitcast i8 addrspace(1)* [[DOT12]] to i8 addrspace(1)* addrspace(1)* ; CHECK-NEXT: [[UMAX2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2:%.*]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX2]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] 
= icmp ult i64 [[UMAX2]], 20 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 1) diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll --- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll @@ -40,7 +40,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 1 ; CHECK-NEXT: [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], [[UMIN1]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP6]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP6]], 32 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[TMP7:%.*]] = add i8 [[CONV3]], -1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll --- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll @@ -22,7 +22,7 @@ ; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(10)* [[TMP6]], i64 0, i32 1 ; CHECK-NEXT: [[DOTUNPACK2:%.*]] = load i64, i64 addrspace(10)* [[DOTELT1]], align 8, !tbaa [[TBAA8]] ; CHECK-NEXT: [[TMP11:%.*]] = add nsw i64 [[TMP2]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP11]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP11]], 28 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP2]]) diff --git a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll 
--- a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-vectorize -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=OVERRIDE -; RUN: opt < %s -loop-vectorize -pragma-vectorize-memory-check-threshold=6 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -8,20 +7,12 @@ ; First loop produced diagnostic pass remark. ;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 2) ; Second loop produces diagnostic analysis remark. -;CHECK: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations - -; First loop produced diagnostic pass remark. -;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 2) -; Second loop produces diagnostic pass remark. -;OVERRIDE: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations +;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1) ; We are vectorizing with 6 runtime checks. 
;CHECK-LABEL: func1x6( ;CHECK: <4 x i32> ;CHECK: ret -;OVERRIDE-LABEL: func1x6( -;OVERRIDE: <4 x i32> -;OVERRIDE: ret define i32 @func1x6(i32* nocapture %out, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) { entry: br label %for.body @@ -52,14 +43,10 @@ ret i32 undef } -; We are not vectorizing with 12 runtime checks. +; We are vectorizing with 12 runtime checks. ;CHECK-LABEL: func2x6( -;CHECK-NOT: <4 x i32> +;CHECK: <4 x i32> ;CHECK: ret -; We vectorize with 12 checks if a vectorization hint is provided. -;OVERRIDE-LABEL: func2x6( -;OVERRIDE-NOT: <4 x i32> -;OVERRIDE: ret define i32 @func2x6(i32* nocapture %out, i32* nocapture %out2, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) { entry: br label %for.body @@ -100,4 +87,3 @@ for.end: ; preds = %for.body ret i32 undef } -