Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -285,9 +285,10 @@
     /// Allow emitting expensive instructions (such as divisions) when computing
     /// the trip count of a loop for runtime unrolling.
     bool AllowExpensiveTripCount;
-    /// Apply loop unroll on any kind of loop
-    /// (mainly to loops that fail runtime unrolling).
-    bool Force;
+    /// Apply loop unroll on any kind of loop (mainly to loops that fail
+    /// runtime unrolling). Values 0 and 1 disable forced unrolling; any
+    /// larger value is the maximum allowed forced unrolling factor.
+    unsigned ForceMaxCount;
   };
 
   /// \brief Get target-customized preferences for the generic loop unrolling
Index: include/llvm/Transforms/Utils/UnrollLoop.h
===================================================================
--- include/llvm/Transforms/Utils/UnrollLoop.h
+++ include/llvm/Transforms/Utils/UnrollLoop.h
@@ -30,11 +30,12 @@
 class OptimizationRemarkEmitter;
 class ScalarEvolution;
 
-bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
-                bool AllowRuntime, bool AllowExpensiveTripCount,
-                unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE,
-                DominatorTree *DT, AssumptionCache *AC,
-                OptimizationRemarkEmitter *ORE, bool PreserveLCSSA);
+bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
+                unsigned ForceMaxCount, bool AllowRuntime,
+                bool AllowExpensiveTripCount, unsigned TripMultiple,
+                LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+                AssumptionCache *AC, OptimizationRemarkEmitter *ORE,
+                bool PreserveLCSSA);
 
 bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
                                 bool AllowExpensiveTripCount,
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.h
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -50,6 +50,8 @@
 
   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
 
+  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
   /// @}
 
   /// \name Vector TTI Implementations
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -238,6 +238,82 @@
   return TTI::PSK_Software;
 }
 
+void SystemZTTIImpl::getUnrollingPreferences(Loop *L,
+                                             TTI::UnrollingPreferences &UP) {
+  // Find out if L contains a call, what the machine instruction count
+  // estimate is, and how many stores there are.
+  bool HasCall = false;
+  unsigned MICountEstimate = 0;
+  unsigned NumStores = 0;
+  for (BasicBlock *BB : L->blocks()) {
+    for (Instruction &I : *BB) {
+      if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
+        ImmutableCallSite CS(&I);
+        if (const Function *F = CS.getCalledFunction()) {
+          if (isLoweredToCall(F))
+            HasCall = true;
+          if (F->getIntrinsicID() == Intrinsic::memcpy ||
+              F->getIntrinsicID() == Intrinsic::memset)
+            NumStores++;
+        } else { // indirect call.
+          HasCall = true;
+        }
+      }
+      if (isa<StoreInst>(&I)) {
+        NumStores++;
+        Type *MemAccessTy = I.getOperand(0)->getType();
+        if ((MemAccessTy->isIntegerTy() || MemAccessTy->isFloatingPointTy()) &&
+            (getDataLayout().getTypeSizeInBits(MemAccessTy) == 128))
+          NumStores++; // 128 bit fp/int stores get split.
+      }
+      if (getUserCost(&I) !=
+          TargetTransformInfo::TargetCostConstants::TCC_Free)
+        MICountEstimate++;
+    }
+  }
+
+  // The z13 processor will run out of store tags if too many stores
+  // are fed into it too quickly. Therefore make sure there are not
+  // too many stores in the resulting unrolled loop.
+  unsigned const Max = (NumStores ? (12 / NumStores) : UINT_MAX);
+
+  if (HasCall) {
+    // Only allow full unrolling if loop has any calls.
+    UP.FullUnrollMaxCount = Max;
+    UP.MaxCount = 1;
+    return;
+  }
+
+  UP.MaxCount = Max;
+  if (UP.MaxCount <= 1)
+    return;
+
+  // Allow partial and runtime trip count unrolling.
+  UP.Partial = UP.Runtime = true;
+  UP.PartialThreshold = UP.Threshold;
+
+  // Allow expensive instructions in the pre-header of the loop.
+  UP.AllowExpensiveTripCount = true;
+
+  // If unrolling is forced (i.e. producing cloned iterations each
+  // with an exit branch), only do enough to get rid of tiny
+  // loops.
+
+  // Compute a smallest count to make the loop at least 8
+  // instructions. The estimate for number of MachineInstrs in loop is
+  // not always exact, but using a value of 12 gets ~99% of loops <= 8
+  // MIs.
+  unsigned const MinSize = 12;
+  // Clamp the estimate to at least 1 so the division is always defined.
+  unsigned const SizeEstimate = std::max(MICountEstimate, 1U);
+  // Round up: ceil(MinSize / SizeEstimate).
+  UP.ForceMaxCount = (MinSize + SizeEstimate - 1) / SizeEstimate;
+  // Don't do more than max 3 iterations
+  UP.ForceMaxCount = std::min(UP.ForceMaxCount, 3U);
+  // Don't go past the limit for number of stores per loop.
+  UP.ForceMaxCount = std::min(UP.ForceMaxCount, Max);
+}
+
 unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) {
   if (!Vector)
     // Discount the stack pointer. Also leave out %r0, since it can't
Index: lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -128,7 +128,7 @@
   UP.Runtime = false;
   UP.AllowRemainder = true;
   UP.AllowExpensiveTripCount = false;
-  UP.Force = false;
+  UP.ForceMaxCount = 0;
 
   // Override with any target specific settings
   TTI.getUnrollingPreferences(L, UP);
@@ -709,7 +709,7 @@
   if (UserUnrollCount) {
     UP.Count = UnrollCount;
     UP.AllowExpensiveTripCount = true;
-    UP.Force = true;
+    UP.ForceMaxCount = UINT_MAX;
     if (UP.AllowRemainder &&
         (LoopSize - BEInsns) * UP.Count + BEInsns < UP.Threshold)
       return true;
@@ -721,7 +721,7 @@
       UP.Count = PragmaCount;
       UP.Runtime = true;
       UP.AllowExpensiveTripCount = true;
-      UP.Force = true;
+      UP.ForceMaxCount = UINT_MAX;
       if (UP.AllowRemainder &&
           (LoopSize - BEInsns) * UP.Count + BEInsns < PragmaUnrollThreshold)
         return true;
@@ -976,7 +976,7 @@
     UP.Count = TripCount;
 
   // Unroll the loop.
-  if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime,
+  if (!UnrollLoop(L, UP.Count, TripCount, UP.ForceMaxCount, UP.Runtime,
                   UP.AllowExpensiveTripCount, TripMultiple, LI, SE, &DT, &AC,
                   &ORE, PreserveLCSSA))
     return false;
Index: lib/Transforms/Utils/LoopUnroll.cpp
===================================================================
--- lib/Transforms/Utils/LoopUnroll.cpp
+++ lib/Transforms/Utils/LoopUnroll.cpp
@@ -201,11 +201,12 @@
 ///
 /// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
 /// DominatorTree if they are non-null.
-bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
-                      bool AllowRuntime, bool AllowExpensiveTripCount,
-                      unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE,
-                      DominatorTree *DT, AssumptionCache *AC,
-                      OptimizationRemarkEmitter *ORE, bool PreserveLCSSA) {
+bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
+                      unsigned ForceMaxCount, bool AllowRuntime,
+                      bool AllowExpensiveTripCount, unsigned TripMultiple,
+                      LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+                      AssumptionCache *AC, OptimizationRemarkEmitter *ORE,
+                      bool PreserveLCSSA) {
   BasicBlock *Preheader = L->getLoopPreheader();
   if (!Preheader) {
     DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
@@ -302,8 +303,10 @@
       !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
                                   UnrollRuntimeEpilog, LI, SE, DT,
                                   PreserveLCSSA)) {
-    if (Force)
+    if (ForceMaxCount > 1) {
       RuntimeTripCount = false;
+      Count = std::min(Count, ForceMaxCount);
+    }
     else
       return false;
   }