Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -73,6 +73,14 @@ MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); +/// Limits the size of scheduling regions in a block. +/// It avoid long compile times for _very_ large blocks where vector +/// instructions are spread over a wide range. +/// This limit is way higher than needed by real-world functions. +static cl::opt +ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, + cl::desc("Limit the size of the SLP scheduling region per block")); + namespace { // FIXME: Set this via cl::opt to allow overriding. @@ -89,6 +97,10 @@ // This limit is useful for very large basic blocks. static const unsigned MaxMemDepDistance = 160; +/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling +/// regions to be handled. +static const int MinScheduleRegionSize = 16; + /// \brief Predicate for the element types that the SLP vectorizer supports. /// /// The most important thing to filter here are types which are invalid in LLVM @@ -720,6 +732,8 @@ : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), ScheduleStart(nullptr), ScheduleEnd(nullptr), FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr), + ScheduleRegionSize(0), + ScheduleRegionSizeLimit(ScheduleRegionSizeBudget), // Make sure that the initial SchedulingRegionID is greater than the // initial SchedulingRegionID in ScheduleData (which is 0). SchedulingRegionID(1) {} @@ -731,6 +745,13 @@ FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; + // Reduce the maximum schedule region size by the size of the + // previous scheduling run. + ScheduleRegionSizeLimit -= ScheduleRegionSize; + if (ScheduleRegionSizeLimit < MinScheduleRegionSize) + ScheduleRegionSizeLimit = MinScheduleRegionSize; + ScheduleRegionSize = 0; + // Make a new scheduling region, i.e. all existing ScheduleData is not // in the new region yet. ++SchedulingRegionID; @@ -807,7 +828,8 @@ void cancelScheduling(ArrayRef VL); /// Extends the scheduling region so that V is inside the region. - void extendSchedulingRegion(Value *V); + /// \returns true if the region size is within the limit. + bool extendSchedulingRegion(Value *V); /// Initialize the ScheduleData structures for new instructions in the /// scheduling region. @@ -861,6 +883,12 @@ /// (can be null). ScheduleData *LastLoadStoreInRegion; + /// The current size of the scheduling region. + int ScheduleRegionSize; + + /// The maximum size allowed for the scheduling region. + int ScheduleRegionSizeLimit; + /// The ID of the scheduling region. For a new vectorization iteration this /// is incremented which "removes" all ScheduleData from the region. int SchedulingRegionID; @@ -1080,7 +1108,9 @@ if (!BS.tryScheduleBundle(VL, this)) { DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - BS.cancelScheduling(VL); + assert((!BS.getScheduleData(VL[0]) || + !BS.getScheduleData(VL[0])->isPartOfBundle()) && + "tryScheduleBundle should cancelScheduling on failure"); newTreeEntry(VL, false); return; } @@ -2686,8 +2716,15 @@ ScheduleData *Bundle = nullptr; bool ReSchedule = false; DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); + + // Make sure that the scheduling region contains all + // instructions of the bundle. + for (Value *V : VL) { + if (!extendSchedulingRegion(V)) + return false; + } + for (Value *V : VL) { - extendSchedulingRegion(V); ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); @@ -2748,7 +2785,11 @@ schedule(pickedSD, ReadyInsts); } } - return Bundle->isReady(); + if (!Bundle->isReady()) { + cancelScheduling(VL); + return false; + } + return true; } void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL) { @@ -2777,9 +2818,9 @@ } } -void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { +bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { if (getScheduleData(V)) - return; + return true; Instruction *I = dyn_cast(V); assert(I && "bundle member must be an instruction"); assert(!isa(I) && "phi nodes don't need to be scheduled"); @@ -2790,7 +2831,7 @@ ScheduleEnd = I->getNextNode(); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); - return; + return true; } // Search up and down at the same time, because we don't know if the new // instruction is above or below the existing scheduling region. @@ -2799,12 +2840,17 @@ BasicBlock::iterator DownIter(ScheduleEnd); BasicBlock::iterator LowerEnd = BB->end(); for (;;) { + if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { + DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); + return false; + } + if (UpIter != UpperEnd) { if (&*UpIter == I) { initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); ScheduleStart = I; DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); - return; + return true; } UpIter++; } @@ -2815,13 +2861,14 @@ ScheduleEnd = I->getNextNode(); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); - return; + return true; } DownIter++; } assert((UpIter != UpperEnd || DownIter != LowerEnd) && "instruction not found in block"); } + return true; } void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, Index: test/Transforms/SLPVectorizer/X86/schedule_budget.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/schedule_budget.ll +++ test/Transforms/SLPVectorizer/X86/schedule_budget.ll @@ -0,0 +1,66 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -S -slp-schedule-budget=16 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +; Test if the budget for the scheduling region size works. +; We test with a reduced budget of 16 which should prevent vectorizing the loads. + +declare void @unknown() + +; CHECK-LABEL: @test +; CHECK: load float +; CHECK: load float +; CHECK: load float +; CHECK: load float +; CHECK: call void @unknown +define void @test(float * %a, float * %b) { +entry: + %l0 = load float, float* %a + %a1 = getelementptr inbounds float, float* %a, i64 1 + %l1 = load float, float* %a1 + %a2 = getelementptr inbounds float, float* %a, i64 2 + %l2 = load float, float* %a2 + %a3 = getelementptr inbounds float, float* %a, i64 3 + %l3 = load float, float* %a3 + + ; some unrelated instructions inbetween to enlarge the scheduling region + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + call void @unknown() + + store float %l0, float* %b + %b1 = getelementptr inbounds float, float* %b, i64 1 + store float %l1, float* %b1 + %b2 = getelementptr inbounds float, float* %b, i64 2 + store float %l2, float* %b2 + %b3 = getelementptr inbounds float, float* %b, i64 3 + store float %l3, float* %b3 + ret void +} +